1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2*(a->nz) - A->n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59 { 60 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61 PetscErrorCode ierr; 62 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63 PetscInt *diag = a->diag,oidx; 64 MatScalar *aa=a->a,*v; 65 PetscScalar s1,s2,x1,x2; 66 PetscScalar *x,*b; 67 68 PetscFunctionBegin; 69 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 70 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 71 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72 73 /* forward solve the U^T */ 74 idx = 0; 75 for (i=0; i<n; i++) { 76 77 v = aa + 4*diag[i]; 78 /* multiply by the inverse of the block diagonal */ 79 x1 = x[idx]; x2 = x[1+idx]; 80 s1 = v[0]*x1 + v[1]*x2; 81 s2 = v[2]*x1 + v[3]*x2; 82 v += 4; 83 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 oidx = 2*(*vi++); 88 x[oidx] -= v[0]*s1 + v[1]*s2; 89 x[oidx+1] -= v[2]*s1 + v[3]*s2; 90 v += 4; 91 } 92 x[idx] = s1;x[1+idx] = s2; 93 idx += 2; 94 } 95 /* backward solve the L^T */ 96 for (i=n-1; i>=0; i--){ 97 v = aa + 4*diag[i] - 4; 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - ai[i]; 100 idt = 2*i; 101 s1 = x[idt]; s2 = x[1+idt]; 102 while (nz--) { 103 idx = 2*(*vi--); 104 x[idx] -= v[0]*s1 + v[1]*s2; 105 x[idx+1] -= v[2]*s1 + v[3]*s2; 106 v -= 4; 107 } 108 } 109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118 { 119 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120 PetscErrorCode ierr; 121 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122 PetscInt *diag = a->diag,oidx; 123 MatScalar *aa=a->a,*v; 124 PetscScalar s1,s2,s3,x1,x2,x3; 125 PetscScalar *x,*b; 126 127 PetscFunctionBegin; 128 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 129 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 130 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131 132 /* forward solve the U^T */ 133 idx = 0; 134 for (i=0; i<n; i++) { 135 136 v = aa + 9*diag[i]; 137 /* multiply by the inverse of the block diagonal */ 138 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142 v += 9; 143 144 vi = aj + diag[i] + 1; 145 nz = ai[i+1] - diag[i] - 1; 146 while (nz--) { 147 oidx = 3*(*vi++); 148 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151 v += 9; 152 } 153 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154 idx += 3; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + 9*diag[i] - 9; 159 vi = aj + diag[i] - 1; 160 nz = diag[i] - ai[i]; 161 idt = 3*i; 162 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163 while (nz--) { 164 idx = 3*(*vi--); 165 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168 v -= 9; 169 } 170 } 171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180 { 181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182 PetscErrorCode ierr; 183 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184 PetscInt *diag = a->diag,oidx; 185 MatScalar *aa=a->a,*v; 186 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 187 PetscScalar *x,*b; 188 189 PetscFunctionBegin; 190 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 191 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193 194 /* forward solve the U^T */ 195 idx = 0; 196 for (i=0; i<n; i++) { 197 198 v = aa + 16*diag[i]; 199 /* multiply by the inverse of the block diagonal */ 200 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205 v += 16; 206 207 vi = aj + diag[i] + 1; 208 nz = ai[i+1] - diag[i] - 1; 209 while (nz--) { 210 oidx = 4*(*vi++); 211 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215 v += 16; 216 } 217 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218 idx += 4; 219 } 220 /* backward solve the L^T */ 221 for (i=n-1; i>=0; i--){ 222 v = aa + 16*diag[i] - 16; 223 vi = aj + diag[i] - 1; 224 nz = diag[i] - ai[i]; 225 idt = 4*i; 226 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi--); 229 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233 v -= 16; 234 } 235 } 236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245 { 246 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247 PetscErrorCode ierr; 248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249 PetscInt *diag = a->diag,oidx; 250 MatScalar *aa=a->a,*v; 251 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 252 PetscScalar *x,*b; 253 254 PetscFunctionBegin; 255 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258 259 /* forward solve the U^T */ 260 idx = 0; 261 for (i=0; i<n; i++) { 262 263 v = aa + 25*diag[i]; 264 /* multiply by the inverse of the block diagonal */ 265 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271 v += 25; 272 273 vi = aj + diag[i] + 1; 274 nz = ai[i+1] - diag[i] - 1; 275 while (nz--) { 276 oidx = 5*(*vi++); 277 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282 v += 25; 283 } 284 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285 idx += 5; 286 } 287 /* backward solve the L^T */ 288 for (i=n-1; i>=0; i--){ 289 v = aa + 25*diag[i] - 25; 290 vi = aj + diag[i] - 1; 291 nz = diag[i] - ai[i]; 292 idt = 5*i; 293 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294 while (nz--) { 295 idx = 5*(*vi--); 296 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301 v -= 25; 302 } 303 } 304 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 #undef __FUNCT__ 311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313 { 314 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315 PetscErrorCode ierr; 316 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317 PetscInt *diag = a->diag,oidx; 318 MatScalar *aa=a->a,*v; 319 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 320 PetscScalar *x,*b; 321 322 PetscFunctionBegin; 323 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 324 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326 327 /* forward solve the U^T */ 328 idx = 0; 329 for (i=0; i<n; i++) { 330 331 v = aa + 36*diag[i]; 332 /* multiply by the inverse of the block diagonal */ 333 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334 x6 = x[5+idx]; 335 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341 v += 36; 342 343 vi = aj + diag[i] + 1; 344 nz = ai[i+1] - diag[i] - 1; 345 while (nz--) { 346 oidx = 6*(*vi++); 347 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353 v += 36; 354 } 355 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356 x[5+idx] = s6; 357 idx += 6; 358 } 359 /* backward solve the L^T */ 360 for (i=n-1; i>=0; i--){ 361 v = aa + 36*diag[i] - 36; 362 vi = aj + diag[i] - 1; 363 nz = diag[i] - ai[i]; 364 idt = 6*i; 365 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366 s6 = x[5+idt]; 367 while (nz--) { 368 idx = 6*(*vi--); 369 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375 v -= 36; 376 } 377 } 378 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389 PetscErrorCode ierr; 390 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391 PetscInt *diag = a->diag,oidx; 392 MatScalar *aa=a->a,*v; 393 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 394 PetscScalar *x,*b; 395 396 PetscFunctionBegin; 397 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400 401 /* forward solve the U^T */ 402 idx = 0; 403 for (i=0; i<n; i++) { 404 405 v = aa + 49*diag[i]; 406 /* multiply by the inverse of the block diagonal */ 407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408 x6 = x[5+idx]; x7 = x[6+idx]; 409 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416 v += 49; 417 418 vi = aj + diag[i] + 1; 419 nz = ai[i+1] - diag[i] - 1; 420 while (nz--) { 421 oidx = 7*(*vi++); 422 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429 v += 49; 430 } 431 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432 x[5+idx] = s6;x[6+idx] = s7; 433 idx += 7; 434 } 435 /* backward solve the L^T */ 436 for (i=n-1; i>=0; i--){ 437 v = aa + 49*diag[i] - 49; 438 vi = aj + diag[i] - 1; 439 nz = diag[i] - ai[i]; 440 idt = 7*i; 441 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442 s6 = x[5+idt];s7 = x[6+idt]; 443 while (nz--) { 444 idx = 7*(*vi--); 445 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452 v -= 49; 453 } 454 } 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 /*---------------------------------------------------------------------------------------------*/ 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467 IS iscol=a->col,isrow=a->row; 468 PetscErrorCode ierr; 469 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 470 PetscInt *diag = a->diag; 471 MatScalar *aa=a->a,*v; 472 PetscScalar s1,*x,*b,*t; 473 474 PetscFunctionBegin; 475 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477 t = a->solve_work; 478 479 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 480 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 481 482 /* copy the b into temp work space according to permutation */ 483 for (i=0; i<n; i++) { 484 t[i] = b[c[i]]; 485 } 486 487 /* forward solve the U^T */ 488 for (i=0; i<n; i++) { 489 490 v = aa + diag[i]; 491 /* multiply by the inverse of the block diagonal */ 492 s1 = (*v++)*t[i]; 493 vi = aj + diag[i] + 1; 494 nz = ai[i+1] - diag[i] - 1; 495 while (nz--) { 496 t[*vi++] -= (*v++)*s1; 497 } 498 t[i] = s1; 499 } 500 /* backward solve the L^T */ 501 for (i=n-1; i>=0; i--){ 502 v = aa + diag[i] - 1; 503 vi = aj + diag[i] - 1; 504 nz = diag[i] - ai[i]; 505 s1 = t[i]; 506 while (nz--) { 507 t[*vi--] -= (*v--)*s1; 508 } 509 } 510 511 /* copy t into x according to permutation */ 512 for (i=0; i<n; i++) { 513 x[r[i]] = t[i]; 514 } 515 516 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 517 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 518 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 519 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 520 ierr = PetscLogFlops(2*(a->nz) - A->n);CHKERRQ(ierr); 521 PetscFunctionReturn(0); 522 } 523 524 #undef __FUNCT__ 525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 527 { 528 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 529 IS iscol=a->col,isrow=a->row; 530 PetscErrorCode ierr; 531 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 532 PetscInt *diag = a->diag,ii,ic,ir,oidx; 533 MatScalar *aa=a->a,*v; 534 PetscScalar s1,s2,x1,x2; 535 PetscScalar *x,*b,*t; 536 537 PetscFunctionBegin; 538 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540 t = a->solve_work; 541 542 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 543 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 544 545 /* copy the b into temp work space according to permutation */ 546 ii = 0; 547 for (i=0; i<n; i++) { 548 ic = 2*c[i]; 549 t[ii] = b[ic]; 550 t[ii+1] = b[ic+1]; 551 ii += 2; 552 } 553 554 /* forward solve the U^T */ 555 idx = 0; 556 for (i=0; i<n; i++) { 557 558 v = aa + 4*diag[i]; 559 /* multiply by the inverse of the block diagonal */ 560 x1 = t[idx]; x2 = t[1+idx]; 561 s1 = v[0]*x1 + v[1]*x2; 562 s2 = v[2]*x1 + v[3]*x2; 563 v += 4; 564 565 vi = aj + diag[i] + 1; 566 nz = ai[i+1] - diag[i] - 1; 567 while (nz--) { 568 oidx = 2*(*vi++); 569 t[oidx] -= v[0]*s1 + v[1]*s2; 570 t[oidx+1] -= v[2]*s1 + v[3]*s2; 571 v += 4; 572 } 573 t[idx] = s1;t[1+idx] = s2; 574 idx += 2; 575 } 576 /* backward solve the L^T */ 577 for (i=n-1; i>=0; i--){ 578 v = aa + 4*diag[i] - 4; 579 vi = aj + diag[i] - 1; 580 nz = diag[i] - ai[i]; 581 idt = 2*i; 582 s1 = t[idt]; s2 = t[1+idt]; 583 while (nz--) { 584 idx = 2*(*vi--); 585 t[idx] -= v[0]*s1 + v[1]*s2; 586 t[idx+1] -= v[2]*s1 + v[3]*s2; 587 v -= 4; 588 } 589 } 590 591 /* copy t into x according to permutation */ 592 ii = 0; 593 for (i=0; i<n; i++) { 594 ir = 2*r[i]; 595 x[ir] = t[ii]; 596 x[ir+1] = t[ii+1]; 597 ii += 2; 598 } 599 600 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 601 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 602 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 603 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 604 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr); 605 PetscFunctionReturn(0); 606 } 607 608 #undef __FUNCT__ 609 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 610 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 611 { 612 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 613 IS iscol=a->col,isrow=a->row; 614 PetscErrorCode ierr; 615 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 616 PetscInt *diag = a->diag,ii,ic,ir,oidx; 617 MatScalar *aa=a->a,*v; 618 PetscScalar s1,s2,s3,x1,x2,x3; 619 PetscScalar *x,*b,*t; 620 621 PetscFunctionBegin; 622 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 623 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 624 t = a->solve_work; 625 626 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 627 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 628 629 /* copy the b into temp work space according to permutation */ 630 ii = 0; 631 for (i=0; i<n; i++) { 632 ic = 3*c[i]; 633 t[ii] = b[ic]; 634 t[ii+1] = b[ic+1]; 635 t[ii+2] = b[ic+2]; 636 ii += 3; 637 } 638 639 /* forward solve the U^T */ 640 idx = 0; 641 for (i=0; i<n; i++) { 642 643 v = aa + 9*diag[i]; 644 /* multiply by the inverse of the block diagonal */ 645 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 646 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 647 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 648 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 649 v += 9; 650 651 vi = aj + diag[i] + 1; 652 nz = ai[i+1] - diag[i] - 1; 653 while (nz--) { 654 oidx = 3*(*vi++); 655 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658 v += 9; 659 } 660 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 661 idx += 3; 662 } 663 /* backward solve the L^T */ 664 for (i=n-1; i>=0; i--){ 665 v = aa + 9*diag[i] - 9; 666 vi = aj + diag[i] - 1; 667 nz = diag[i] - ai[i]; 668 idt = 3*i; 669 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 670 while (nz--) { 671 idx = 3*(*vi--); 672 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 673 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 674 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 675 v -= 9; 676 } 677 } 678 679 /* copy t into x according to permutation */ 680 ii = 0; 681 for (i=0; i<n; i++) { 682 ir = 3*r[i]; 683 x[ir] = t[ii]; 684 x[ir+1] = t[ii+1]; 685 x[ir+2] = t[ii+2]; 686 ii += 3; 687 } 688 689 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 690 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 691 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 692 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 693 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696 697 #undef __FUNCT__ 698 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 699 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 700 { 701 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 702 IS iscol=a->col,isrow=a->row; 703 PetscErrorCode ierr; 704 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 705 PetscInt *diag = a->diag,ii,ic,ir,oidx; 706 MatScalar *aa=a->a,*v; 707 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 708 PetscScalar *x,*b,*t; 709 710 PetscFunctionBegin; 711 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 712 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 713 t = a->solve_work; 714 715 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 716 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 717 718 /* copy the b into temp work space according to permutation */ 719 ii = 0; 720 for (i=0; i<n; i++) { 721 ic = 4*c[i]; 722 t[ii] = b[ic]; 723 t[ii+1] = b[ic+1]; 724 t[ii+2] = b[ic+2]; 725 t[ii+3] = b[ic+3]; 726 ii += 4; 727 } 728 729 /* forward solve the U^T */ 730 idx = 0; 731 for (i=0; i<n; i++) { 732 733 v = aa + 16*diag[i]; 734 /* multiply by the inverse of the block diagonal */ 735 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 736 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 737 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 738 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 739 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 740 v += 16; 741 742 vi = aj + diag[i] + 1; 743 nz = ai[i+1] - diag[i] - 1; 744 while (nz--) { 745 oidx = 4*(*vi++); 746 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750 v += 16; 751 } 752 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 753 idx += 4; 754 } 755 /* backward solve the L^T */ 756 for (i=n-1; i>=0; i--){ 757 v = aa + 16*diag[i] - 16; 758 vi = aj + diag[i] - 1; 759 nz = diag[i] - ai[i]; 760 idt = 4*i; 761 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 762 while (nz--) { 763 idx = 4*(*vi--); 764 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 765 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 766 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 767 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 768 v -= 16; 769 } 770 } 771 772 /* copy t into x according to permutation */ 773 ii = 0; 774 for (i=0; i<n; i++) { 775 ir = 4*r[i]; 776 x[ir] = t[ii]; 777 x[ir+1] = t[ii+1]; 778 x[ir+2] = t[ii+2]; 779 x[ir+3] = t[ii+3]; 780 ii += 4; 781 } 782 783 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 784 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 785 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 786 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 787 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 788 PetscFunctionReturn(0); 789 } 790 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 IS iscol=a->col,isrow=a->row; 797 PetscErrorCode ierr; 798 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 799 PetscInt *diag = a->diag,ii,ic,ir,oidx; 800 MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 802 PetscScalar *x,*b,*t; 803 804 PetscFunctionBegin; 805 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 806 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 807 t = a->solve_work; 808 809 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 810 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 811 812 /* copy the b into temp work space according to permutation */ 813 ii = 0; 814 for (i=0; i<n; i++) { 815 ic = 5*c[i]; 816 t[ii] = b[ic]; 817 t[ii+1] = b[ic+1]; 818 t[ii+2] = b[ic+2]; 819 t[ii+3] = b[ic+3]; 820 t[ii+4] = b[ic+4]; 821 ii += 5; 822 } 823 824 /* forward solve the U^T */ 825 idx = 0; 826 for (i=0; i<n; i++) { 827 828 v = aa + 25*diag[i]; 829 /* multiply by the inverse of the block diagonal */ 830 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 831 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 832 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 833 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 834 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 835 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 836 v += 25; 837 838 vi = aj + diag[i] + 1; 839 nz = ai[i+1] - diag[i] - 1; 840 while (nz--) { 841 oidx = 5*(*vi++); 842 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847 v += 25; 848 } 849 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 850 idx += 5; 851 } 852 /* backward solve the L^T */ 853 for (i=n-1; i>=0; i--){ 854 v = aa + 25*diag[i] - 25; 855 vi = aj + diag[i] - 1; 856 nz = diag[i] - ai[i]; 857 idt = 5*i; 858 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 859 while (nz--) { 860 idx = 5*(*vi--); 861 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 862 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 863 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 864 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 865 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 866 v -= 25; 867 } 868 } 869 870 /* copy t into x according to permutation */ 871 ii = 0; 872 for (i=0; i<n; i++) { 873 ir = 5*r[i]; 874 x[ir] = t[ii]; 875 x[ir+1] = t[ii+1]; 876 x[ir+2] = t[ii+2]; 877 x[ir+3] = t[ii+3]; 878 x[ir+4] = t[ii+4]; 879 ii += 5; 880 } 881 882 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 883 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 884 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 885 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 886 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr); 887 PetscFunctionReturn(0); 888 } 889 890 #undef __FUNCT__ 891 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 892 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 893 { 894 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 895 IS iscol=a->col,isrow=a->row; 896 PetscErrorCode ierr; 897 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 898 PetscInt *diag = a->diag,ii,ic,ir,oidx; 899 MatScalar *aa=a->a,*v; 900 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 901 PetscScalar *x,*b,*t; 902 903 PetscFunctionBegin; 904 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 905 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 906 t = a->solve_work; 907 908 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 909 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 910 911 /* copy the b into temp work space according to permutation */ 912 ii = 0; 913 for (i=0; i<n; i++) { 914 ic = 6*c[i]; 915 t[ii] = b[ic]; 916 t[ii+1] = b[ic+1]; 917 t[ii+2] = b[ic+2]; 918 t[ii+3] = b[ic+3]; 919 t[ii+4] = b[ic+4]; 920 t[ii+5] = b[ic+5]; 921 ii += 6; 922 } 923 924 /* forward solve the U^T */ 925 idx = 0; 926 for (i=0; i<n; i++) { 927 928 v = aa + 36*diag[i]; 929 /* multiply by the inverse of the block diagonal */ 930 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 931 x6 = t[5+idx]; 932 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 933 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 934 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 935 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 936 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 937 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 938 v += 36; 939 940 vi = aj + diag[i] + 1; 941 nz = ai[i+1] - diag[i] - 1; 942 while (nz--) { 943 oidx = 6*(*vi++); 944 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 945 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 946 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 947 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 948 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 949 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 950 v += 36; 951 } 952 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 953 t[5+idx] = s6; 954 idx += 6; 955 } 956 /* backward solve the L^T */ 957 for (i=n-1; i>=0; i--){ 958 v = aa + 36*diag[i] - 36; 959 vi = aj + diag[i] - 1; 960 nz = diag[i] - ai[i]; 961 idt = 6*i; 962 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 963 s6 = t[5+idt]; 964 while (nz--) { 965 idx = 6*(*vi--); 966 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 967 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 968 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 969 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 970 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 971 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 972 v -= 36; 973 } 974 } 975 976 /* copy t into x according to permutation */ 977 ii = 0; 978 for (i=0; i<n; i++) { 979 ir = 6*r[i]; 980 x[ir] = t[ii]; 981 x[ir+1] = t[ii+1]; 982 x[ir+2] = t[ii+2]; 983 x[ir+3] = t[ii+3]; 984 x[ir+4] = t[ii+4]; 985 x[ir+5] = t[ii+5]; 986 ii += 6; 987 } 988 989 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 990 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 991 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 993 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr); 994 PetscFunctionReturn(0); 995 } 996 997 #undef __FUNCT__ 998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 999 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1000 { 1001 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1002 IS iscol=a->col,isrow=a->row; 1003 PetscErrorCode ierr; 1004 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 1005 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1006 MatScalar *aa=a->a,*v; 1007 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1008 PetscScalar *x,*b,*t; 1009 1010 PetscFunctionBegin; 1011 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1012 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1013 t = a->solve_work; 1014 1015 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1016 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1017 1018 /* copy the b into temp work space according to permutation */ 1019 ii = 0; 1020 for (i=0; i<n; i++) { 1021 ic = 7*c[i]; 1022 t[ii] = b[ic]; 1023 t[ii+1] = b[ic+1]; 1024 t[ii+2] = b[ic+2]; 1025 t[ii+3] = b[ic+3]; 1026 t[ii+4] = b[ic+4]; 1027 t[ii+5] = b[ic+5]; 1028 t[ii+6] = b[ic+6]; 1029 ii += 7; 1030 } 1031 1032 /* forward solve the U^T */ 1033 idx = 0; 1034 for (i=0; i<n; i++) { 1035 1036 v = aa + 49*diag[i]; 1037 /* multiply by the inverse of the block diagonal */ 1038 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1039 x6 = t[5+idx]; x7 = t[6+idx]; 1040 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1041 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1042 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1043 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1044 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1045 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1046 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1047 v += 49; 1048 1049 vi = aj + diag[i] + 1; 1050 nz = ai[i+1] - diag[i] - 1; 1051 while (nz--) { 1052 oidx = 7*(*vi++); 1053 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1054 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1055 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1056 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1057 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1058 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1059 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1060 v += 49; 1061 } 1062 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1063 t[5+idx] = s6;t[6+idx] = s7; 1064 idx += 7; 1065 } 1066 /* backward solve the L^T */ 1067 for (i=n-1; i>=0; i--){ 1068 v = aa + 49*diag[i] - 49; 1069 vi = aj + diag[i] - 1; 1070 nz = diag[i] - ai[i]; 1071 idt = 7*i; 1072 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1073 s6 = t[5+idt];s7 = t[6+idt]; 1074 while (nz--) { 1075 idx = 7*(*vi--); 1076 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1077 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1078 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1079 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1080 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1081 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1082 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1083 v -= 49; 1084 } 1085 } 1086 1087 /* copy t into x according to permutation */ 1088 ii = 0; 1089 for (i=0; i<n; i++) { 1090 ir = 7*r[i]; 1091 x[ir] = t[ii]; 1092 x[ir+1] = t[ii+1]; 1093 x[ir+2] = t[ii+2]; 1094 x[ir+3] = t[ii+3]; 1095 x[ir+4] = t[ii+4]; 1096 x[ir+5] = t[ii+5]; 1097 x[ir+6] = t[ii+6]; 1098 ii += 7; 1099 } 1100 1101 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1102 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1103 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1105 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->n);CHKERRQ(ierr); 1106 PetscFunctionReturn(0); 1107 } 1108 1109 /* ----------------------------------------------------------- */ 1110 #undef __FUNCT__ 1111 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1112 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1113 { 1114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1115 IS iscol=a->col,isrow=a->row; 1116 PetscErrorCode ierr; 1117 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1118 PetscInt nz,bs=A->bs,bs2=a->bs2,*rout,*cout; 1119 MatScalar *aa=a->a,*v; 1120 PetscScalar *x,*b,*s,*t,*ls; 1121 1122 PetscFunctionBegin; 1123 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1124 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1125 t = a->solve_work; 1126 1127 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1128 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1129 1130 /* forward solve the lower triangular */ 1131 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1132 for (i=1; i<n; i++) { 1133 v = aa + bs2*ai[i]; 1134 vi = aj + ai[i]; 1135 nz = a->diag[i] - ai[i]; 1136 s = t + bs*i; 1137 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1138 while (nz--) { 1139 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1140 v += bs2; 1141 } 1142 } 1143 /* backward solve the upper triangular */ 1144 ls = a->solve_work + A->n; 1145 for (i=n-1; i>=0; i--){ 1146 v = aa + bs2*(a->diag[i] + 1); 1147 vi = aj + a->diag[i] + 1; 1148 nz = ai[i+1] - a->diag[i] - 1; 1149 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1150 while (nz--) { 1151 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1152 v += bs2; 1153 } 1154 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1155 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1156 } 1157 1158 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1159 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1160 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1162 ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->bs*A->n);CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 #undef __FUNCT__ 1167 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1169 { 1170 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1171 IS iscol=a->col,isrow=a->row; 1172 PetscErrorCode ierr; 1173 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1174 PetscInt *diag = a->diag; 1175 MatScalar *aa=a->a,*v; 1176 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1177 PetscScalar *x,*b,*t; 1178 1179 PetscFunctionBegin; 1180 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1181 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182 t = a->solve_work; 1183 1184 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1185 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1186 1187 /* forward solve the lower triangular */ 1188 idx = 7*(*r++); 1189 t[0] = b[idx]; t[1] = b[1+idx]; 1190 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1191 t[5] = b[5+idx]; t[6] = b[6+idx]; 1192 1193 for (i=1; i<n; i++) { 1194 v = aa + 49*ai[i]; 1195 vi = aj + ai[i]; 1196 nz = diag[i] - ai[i]; 1197 idx = 7*(*r++); 1198 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1199 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1200 while (nz--) { 1201 idx = 7*(*vi++); 1202 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1203 x4 = t[3+idx];x5 = t[4+idx]; 1204 x6 = t[5+idx];x7 = t[6+idx]; 1205 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1206 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1207 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1208 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1209 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1210 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1211 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1212 v += 49; 1213 } 1214 idx = 7*i; 1215 t[idx] = s1;t[1+idx] = s2; 1216 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1217 t[5+idx] = s6;t[6+idx] = s7; 1218 } 1219 /* backward solve the upper triangular */ 1220 for (i=n-1; i>=0; i--){ 1221 v = aa + 49*diag[i] + 49; 1222 vi = aj + diag[i] + 1; 1223 nz = ai[i+1] - diag[i] - 1; 1224 idt = 7*i; 1225 s1 = t[idt]; s2 = t[1+idt]; 1226 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1227 s6 = t[5+idt];s7 = t[6+idt]; 1228 while (nz--) { 1229 idx = 7*(*vi++); 1230 x1 = t[idx]; x2 = t[1+idx]; 1231 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1232 x6 = t[5+idx]; x7 = t[6+idx]; 1233 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1234 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1235 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1236 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1237 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1238 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1239 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1240 v += 49; 1241 } 1242 idc = 7*(*c--); 1243 v = aa + 49*diag[i]; 1244 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1245 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1246 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1247 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1248 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1249 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1250 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1251 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1252 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1253 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1254 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1255 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1256 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1257 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1258 } 1259 1260 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1261 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1263 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1264 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->n);CHKERRQ(ierr); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #undef __FUNCT__ 1269 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1270 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1271 { 1272 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1273 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1274 PetscErrorCode ierr; 1275 PetscInt *diag = a->diag,jdx; 1276 MatScalar *aa=a->a,*v; 1277 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1278 1279 PetscFunctionBegin; 1280 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1281 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1282 /* forward solve the lower triangular */ 1283 idx = 0; 1284 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1285 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1286 x[6] = b[6+idx]; 1287 for (i=1; i<n; i++) { 1288 v = aa + 49*ai[i]; 1289 vi = aj + ai[i]; 1290 nz = diag[i] - ai[i]; 1291 idx = 7*i; 1292 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1293 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1294 s7 = b[6+idx]; 1295 while (nz--) { 1296 jdx = 7*(*vi++); 1297 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1298 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1299 x7 = x[6+jdx]; 1300 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1301 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1302 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1303 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1304 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1305 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1306 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1307 v += 49; 1308 } 1309 x[idx] = s1; 1310 x[1+idx] = s2; 1311 x[2+idx] = s3; 1312 x[3+idx] = s4; 1313 x[4+idx] = s5; 1314 x[5+idx] = s6; 1315 x[6+idx] = s7; 1316 } 1317 /* backward solve the upper triangular */ 1318 for (i=n-1; i>=0; i--){ 1319 v = aa + 49*diag[i] + 49; 1320 vi = aj + diag[i] + 1; 1321 nz = ai[i+1] - diag[i] - 1; 1322 idt = 7*i; 1323 s1 = x[idt]; s2 = x[1+idt]; 1324 s3 = x[2+idt]; s4 = x[3+idt]; 1325 s5 = x[4+idt]; s6 = x[5+idt]; 1326 s7 = x[6+idt]; 1327 while (nz--) { 1328 idx = 7*(*vi++); 1329 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1330 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1331 x7 = x[6+idx]; 1332 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1333 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1334 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1335 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1336 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1337 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1338 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1339 v += 49; 1340 } 1341 v = aa + 49*diag[i]; 1342 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1343 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1344 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1345 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1346 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1347 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1348 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1349 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1350 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1351 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1352 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1353 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1354 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1355 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1356 } 1357 1358 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1359 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1360 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr); 1361 PetscFunctionReturn(0); 1362 } 1363 1364 #undef __FUNCT__ 1365 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1366 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1367 { 1368 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1369 IS iscol=a->col,isrow=a->row; 1370 PetscErrorCode ierr; 1371 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1372 PetscInt *diag = a->diag; 1373 MatScalar *aa=a->a,*v; 1374 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1375 1376 PetscFunctionBegin; 1377 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1378 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1379 t = a->solve_work; 1380 1381 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1382 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1383 1384 /* forward solve the lower triangular */ 1385 idx = 6*(*r++); 1386 t[0] = b[idx]; t[1] = b[1+idx]; 1387 t[2] = b[2+idx]; t[3] = b[3+idx]; 1388 t[4] = b[4+idx]; t[5] = b[5+idx]; 1389 for (i=1; i<n; i++) { 1390 v = aa + 36*ai[i]; 1391 vi = aj + ai[i]; 1392 nz = diag[i] - ai[i]; 1393 idx = 6*(*r++); 1394 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1395 s5 = b[4+idx]; s6 = b[5+idx]; 1396 while (nz--) { 1397 idx = 6*(*vi++); 1398 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1399 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1400 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1401 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1402 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1403 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1404 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1405 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1406 v += 36; 1407 } 1408 idx = 6*i; 1409 t[idx] = s1;t[1+idx] = s2; 1410 t[2+idx] = s3;t[3+idx] = s4; 1411 t[4+idx] = s5;t[5+idx] = s6; 1412 } 1413 /* backward solve the upper triangular */ 1414 for (i=n-1; i>=0; i--){ 1415 v = aa + 36*diag[i] + 36; 1416 vi = aj + diag[i] + 1; 1417 nz = ai[i+1] - diag[i] - 1; 1418 idt = 6*i; 1419 s1 = t[idt]; s2 = t[1+idt]; 1420 s3 = t[2+idt];s4 = t[3+idt]; 1421 s5 = t[4+idt];s6 = t[5+idt]; 1422 while (nz--) { 1423 idx = 6*(*vi++); 1424 x1 = t[idx]; x2 = t[1+idx]; 1425 x3 = t[2+idx]; x4 = t[3+idx]; 1426 x5 = t[4+idx]; x6 = t[5+idx]; 1427 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1428 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1429 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1430 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1431 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1432 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1433 v += 36; 1434 } 1435 idc = 6*(*c--); 1436 v = aa + 36*diag[i]; 1437 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1438 v[18]*s4+v[24]*s5+v[30]*s6; 1439 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1440 v[19]*s4+v[25]*s5+v[31]*s6; 1441 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1442 v[20]*s4+v[26]*s5+v[32]*s6; 1443 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1444 v[21]*s4+v[27]*s5+v[33]*s6; 1445 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1446 v[22]*s4+v[28]*s5+v[34]*s6; 1447 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1448 v[23]*s4+v[29]*s5+v[35]*s6; 1449 } 1450 1451 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1452 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1453 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1454 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1455 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr); 1456 PetscFunctionReturn(0); 1457 } 1458 1459 #undef __FUNCT__ 1460 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1461 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1462 { 1463 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1464 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1465 PetscErrorCode ierr; 1466 PetscInt *diag = a->diag,jdx; 1467 MatScalar *aa=a->a,*v; 1468 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1469 1470 PetscFunctionBegin; 1471 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1472 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1473 /* forward solve the lower triangular */ 1474 idx = 0; 1475 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1476 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1477 for (i=1; i<n; i++) { 1478 v = aa + 36*ai[i]; 1479 vi = aj + ai[i]; 1480 nz = diag[i] - ai[i]; 1481 idx = 6*i; 1482 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1483 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1484 while (nz--) { 1485 jdx = 6*(*vi++); 1486 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1487 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1488 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1489 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1490 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1491 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1492 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1493 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1494 v += 36; 1495 } 1496 x[idx] = s1; 1497 x[1+idx] = s2; 1498 x[2+idx] = s3; 1499 x[3+idx] = s4; 1500 x[4+idx] = s5; 1501 x[5+idx] = s6; 1502 } 1503 /* backward solve the upper triangular */ 1504 for (i=n-1; i>=0; i--){ 1505 v = aa + 36*diag[i] + 36; 1506 vi = aj + diag[i] + 1; 1507 nz = ai[i+1] - diag[i] - 1; 1508 idt = 6*i; 1509 s1 = x[idt]; s2 = x[1+idt]; 1510 s3 = x[2+idt]; s4 = x[3+idt]; 1511 s5 = x[4+idt]; s6 = x[5+idt]; 1512 while (nz--) { 1513 idx = 6*(*vi++); 1514 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1515 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1516 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1517 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1518 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1519 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1520 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1521 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1522 v += 36; 1523 } 1524 v = aa + 36*diag[i]; 1525 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1526 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1527 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1528 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1529 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1530 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1531 } 1532 1533 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1534 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1535 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->n);CHKERRQ(ierr); 1536 PetscFunctionReturn(0); 1537 } 1538 1539 #undef __FUNCT__ 1540 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1541 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1542 { 1543 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1544 IS iscol=a->col,isrow=a->row; 1545 PetscErrorCode ierr; 1546 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1547 PetscInt *diag = a->diag; 1548 MatScalar *aa=a->a,*v; 1549 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1550 1551 PetscFunctionBegin; 1552 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1553 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1554 t = a->solve_work; 1555 1556 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1557 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1558 1559 /* forward solve the lower triangular */ 1560 idx = 5*(*r++); 1561 t[0] = b[idx]; t[1] = b[1+idx]; 1562 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1563 for (i=1; i<n; i++) { 1564 v = aa + 25*ai[i]; 1565 vi = aj + ai[i]; 1566 nz = diag[i] - ai[i]; 1567 idx = 5*(*r++); 1568 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1569 s5 = b[4+idx]; 1570 while (nz--) { 1571 idx = 5*(*vi++); 1572 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1573 x4 = t[3+idx];x5 = t[4+idx]; 1574 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1575 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1576 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1577 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1578 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1579 v += 25; 1580 } 1581 idx = 5*i; 1582 t[idx] = s1;t[1+idx] = s2; 1583 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1584 } 1585 /* backward solve the upper triangular */ 1586 for (i=n-1; i>=0; i--){ 1587 v = aa + 25*diag[i] + 25; 1588 vi = aj + diag[i] + 1; 1589 nz = ai[i+1] - diag[i] - 1; 1590 idt = 5*i; 1591 s1 = t[idt]; s2 = t[1+idt]; 1592 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1593 while (nz--) { 1594 idx = 5*(*vi++); 1595 x1 = t[idx]; x2 = t[1+idx]; 1596 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1597 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1598 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1599 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1600 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1601 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1602 v += 25; 1603 } 1604 idc = 5*(*c--); 1605 v = aa + 25*diag[i]; 1606 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1607 v[15]*s4+v[20]*s5; 1608 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1609 v[16]*s4+v[21]*s5; 1610 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1611 v[17]*s4+v[22]*s5; 1612 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1613 v[18]*s4+v[23]*s5; 1614 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1615 v[19]*s4+v[24]*s5; 1616 } 1617 1618 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1619 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1620 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1621 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1622 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr); 1623 PetscFunctionReturn(0); 1624 } 1625 1626 #undef __FUNCT__ 1627 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1628 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1629 { 1630 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1631 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1632 PetscErrorCode ierr; 1633 PetscInt *diag = a->diag,jdx; 1634 MatScalar *aa=a->a,*v; 1635 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1636 1637 PetscFunctionBegin; 1638 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1639 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1640 /* forward solve the lower triangular */ 1641 idx = 0; 1642 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1643 for (i=1; i<n; i++) { 1644 v = aa + 25*ai[i]; 1645 vi = aj + ai[i]; 1646 nz = diag[i] - ai[i]; 1647 idx = 5*i; 1648 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1649 while (nz--) { 1650 jdx = 5*(*vi++); 1651 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1652 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1653 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1654 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1655 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1656 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1657 v += 25; 1658 } 1659 x[idx] = s1; 1660 x[1+idx] = s2; 1661 x[2+idx] = s3; 1662 x[3+idx] = s4; 1663 x[4+idx] = s5; 1664 } 1665 /* backward solve the upper triangular */ 1666 for (i=n-1; i>=0; i--){ 1667 v = aa + 25*diag[i] + 25; 1668 vi = aj + diag[i] + 1; 1669 nz = ai[i+1] - diag[i] - 1; 1670 idt = 5*i; 1671 s1 = x[idt]; s2 = x[1+idt]; 1672 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1673 while (nz--) { 1674 idx = 5*(*vi++); 1675 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1676 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1677 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1678 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1679 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1680 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1681 v += 25; 1682 } 1683 v = aa + 25*diag[i]; 1684 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1685 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1686 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1687 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1688 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1689 } 1690 1691 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1692 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1693 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->n);CHKERRQ(ierr); 1694 PetscFunctionReturn(0); 1695 } 1696 1697 #undef __FUNCT__ 1698 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1699 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1700 { 1701 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1702 IS iscol=a->col,isrow=a->row; 1703 PetscErrorCode ierr; 1704 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1705 PetscInt *diag = a->diag; 1706 MatScalar *aa=a->a,*v; 1707 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1708 1709 PetscFunctionBegin; 1710 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1711 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1712 t = a->solve_work; 1713 1714 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1715 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1716 1717 /* forward solve the lower triangular */ 1718 idx = 4*(*r++); 1719 t[0] = b[idx]; t[1] = b[1+idx]; 1720 t[2] = b[2+idx]; t[3] = b[3+idx]; 1721 for (i=1; i<n; i++) { 1722 v = aa + 16*ai[i]; 1723 vi = aj + ai[i]; 1724 nz = diag[i] - ai[i]; 1725 idx = 4*(*r++); 1726 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1727 while (nz--) { 1728 idx = 4*(*vi++); 1729 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1730 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1731 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1732 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1733 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1734 v += 16; 1735 } 1736 idx = 4*i; 1737 t[idx] = s1;t[1+idx] = s2; 1738 t[2+idx] = s3;t[3+idx] = s4; 1739 } 1740 /* backward solve the upper triangular */ 1741 for (i=n-1; i>=0; i--){ 1742 v = aa + 16*diag[i] + 16; 1743 vi = aj + diag[i] + 1; 1744 nz = ai[i+1] - diag[i] - 1; 1745 idt = 4*i; 1746 s1 = t[idt]; s2 = t[1+idt]; 1747 s3 = t[2+idt];s4 = t[3+idt]; 1748 while (nz--) { 1749 idx = 4*(*vi++); 1750 x1 = t[idx]; x2 = t[1+idx]; 1751 x3 = t[2+idx]; x4 = t[3+idx]; 1752 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1753 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1754 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1755 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1756 v += 16; 1757 } 1758 idc = 4*(*c--); 1759 v = aa + 16*diag[i]; 1760 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1761 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1762 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1763 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1764 } 1765 1766 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1767 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1768 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1770 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 1771 PetscFunctionReturn(0); 1772 } 1773 1774 #undef __FUNCT__ 1775 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1776 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1777 { 1778 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1779 IS iscol=a->col,isrow=a->row; 1780 PetscErrorCode ierr; 1781 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1782 PetscInt *diag = a->diag; 1783 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1784 PetscScalar *x,*b; 1785 1786 PetscFunctionBegin; 1787 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1788 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1789 t = (MatScalar *)a->solve_work; 1790 1791 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1792 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1793 1794 /* forward solve the lower triangular */ 1795 idx = 4*(*r++); 1796 t[0] = (MatScalar)b[idx]; 1797 t[1] = (MatScalar)b[1+idx]; 1798 t[2] = (MatScalar)b[2+idx]; 1799 t[3] = (MatScalar)b[3+idx]; 1800 for (i=1; i<n; i++) { 1801 v = aa + 16*ai[i]; 1802 vi = aj + ai[i]; 1803 nz = diag[i] - ai[i]; 1804 idx = 4*(*r++); 1805 s1 = (MatScalar)b[idx]; 1806 s2 = (MatScalar)b[1+idx]; 1807 s3 = (MatScalar)b[2+idx]; 1808 s4 = (MatScalar)b[3+idx]; 1809 while (nz--) { 1810 idx = 4*(*vi++); 1811 x1 = t[idx]; 1812 x2 = t[1+idx]; 1813 x3 = t[2+idx]; 1814 x4 = t[3+idx]; 1815 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1816 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1817 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1818 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1819 v += 16; 1820 } 1821 idx = 4*i; 1822 t[idx] = s1; 1823 t[1+idx] = s2; 1824 t[2+idx] = s3; 1825 t[3+idx] = s4; 1826 } 1827 /* backward solve the upper triangular */ 1828 for (i=n-1; i>=0; i--){ 1829 v = aa + 16*diag[i] + 16; 1830 vi = aj + diag[i] + 1; 1831 nz = ai[i+1] - diag[i] - 1; 1832 idt = 4*i; 1833 s1 = t[idt]; 1834 s2 = t[1+idt]; 1835 s3 = t[2+idt]; 1836 s4 = t[3+idt]; 1837 while (nz--) { 1838 idx = 4*(*vi++); 1839 x1 = t[idx]; 1840 x2 = t[1+idx]; 1841 x3 = t[2+idx]; 1842 x4 = t[3+idx]; 1843 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1844 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1845 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1846 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1847 v += 16; 1848 } 1849 idc = 4*(*c--); 1850 v = aa + 16*diag[i]; 1851 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1852 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1853 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1854 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1855 x[idc] = (PetscScalar)t[idt]; 1856 x[1+idc] = (PetscScalar)t[1+idt]; 1857 x[2+idc] = (PetscScalar)t[2+idt]; 1858 x[3+idc] = (PetscScalar)t[3+idt]; 1859 } 1860 1861 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1862 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1863 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1864 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1865 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 1866 PetscFunctionReturn(0); 1867 } 1868 1869 #if defined (PETSC_HAVE_SSE) 1870 1871 #include PETSC_HAVE_SSE 1872 1873 #undef __FUNCT__ 1874 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1875 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1876 { 1877 /* 1878 Note: This code uses demotion of double 1879 to float when performing the mixed-mode computation. 1880 This may not be numerically reasonable for all applications. 1881 */ 1882 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1883 IS iscol=a->col,isrow=a->row; 1884 PetscErrorCode ierr; 1885 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1886 PetscInt *diag = a->diag,ai16; 1887 MatScalar *aa=a->a,*v; 1888 PetscScalar *x,*b,*t; 1889 1890 /* Make space in temp stack for 16 Byte Aligned arrays */ 1891 float ssealignedspace[11],*tmps,*tmpx; 1892 unsigned long offset; 1893 1894 PetscFunctionBegin; 1895 SSE_SCOPE_BEGIN; 1896 1897 offset = (unsigned long)ssealignedspace % 16; 1898 if (offset) offset = (16 - offset)/4; 1899 tmps = &ssealignedspace[offset]; 1900 tmpx = &ssealignedspace[offset+4]; 1901 PREFETCH_NTA(aa+16*ai[1]); 1902 1903 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1904 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1905 t = a->solve_work; 1906 1907 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1908 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1909 1910 /* forward solve the lower triangular */ 1911 idx = 4*(*r++); 1912 t[0] = b[idx]; t[1] = b[1+idx]; 1913 t[2] = b[2+idx]; t[3] = b[3+idx]; 1914 v = aa + 16*ai[1]; 1915 1916 for (i=1; i<n;) { 1917 PREFETCH_NTA(&v[8]); 1918 vi = aj + ai[i]; 1919 nz = diag[i] - ai[i]; 1920 idx = 4*(*r++); 1921 1922 /* Demote sum from double to float */ 1923 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1924 LOAD_PS(tmps,XMM7); 1925 1926 while (nz--) { 1927 PREFETCH_NTA(&v[16]); 1928 idx = 4*(*vi++); 1929 1930 /* Demote solution (so far) from double to float */ 1931 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1932 1933 /* 4x4 Matrix-Vector product with negative accumulation: */ 1934 SSE_INLINE_BEGIN_2(tmpx,v) 1935 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1936 1937 /* First Column */ 1938 SSE_COPY_PS(XMM0,XMM6) 1939 SSE_SHUFFLE(XMM0,XMM0,0x00) 1940 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1941 SSE_SUB_PS(XMM7,XMM0) 1942 1943 /* Second Column */ 1944 SSE_COPY_PS(XMM1,XMM6) 1945 SSE_SHUFFLE(XMM1,XMM1,0x55) 1946 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1947 SSE_SUB_PS(XMM7,XMM1) 1948 1949 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1950 1951 /* Third Column */ 1952 SSE_COPY_PS(XMM2,XMM6) 1953 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1954 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1955 SSE_SUB_PS(XMM7,XMM2) 1956 1957 /* Fourth Column */ 1958 SSE_COPY_PS(XMM3,XMM6) 1959 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1960 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1961 SSE_SUB_PS(XMM7,XMM3) 1962 SSE_INLINE_END_2 1963 1964 v += 16; 1965 } 1966 idx = 4*i; 1967 v = aa + 16*ai[++i]; 1968 PREFETCH_NTA(v); 1969 STORE_PS(tmps,XMM7); 1970 1971 /* Promote result from float to double */ 1972 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1973 } 1974 /* backward solve the upper triangular */ 1975 idt = 4*(n-1); 1976 ai16 = 16*diag[n-1]; 1977 v = aa + ai16 + 16; 1978 for (i=n-1; i>=0;){ 1979 PREFETCH_NTA(&v[8]); 1980 vi = aj + diag[i] + 1; 1981 nz = ai[i+1] - diag[i] - 1; 1982 1983 /* Demote accumulator from double to float */ 1984 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1985 LOAD_PS(tmps,XMM7); 1986 1987 while (nz--) { 1988 PREFETCH_NTA(&v[16]); 1989 idx = 4*(*vi++); 1990 1991 /* Demote solution (so far) from double to float */ 1992 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1993 1994 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1995 SSE_INLINE_BEGIN_2(tmpx,v) 1996 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1997 1998 /* First Column */ 1999 SSE_COPY_PS(XMM0,XMM6) 2000 SSE_SHUFFLE(XMM0,XMM0,0x00) 2001 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2002 SSE_SUB_PS(XMM7,XMM0) 2003 2004 /* Second Column */ 2005 SSE_COPY_PS(XMM1,XMM6) 2006 SSE_SHUFFLE(XMM1,XMM1,0x55) 2007 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2008 SSE_SUB_PS(XMM7,XMM1) 2009 2010 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2011 2012 /* Third Column */ 2013 SSE_COPY_PS(XMM2,XMM6) 2014 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2015 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2016 SSE_SUB_PS(XMM7,XMM2) 2017 2018 /* Fourth Column */ 2019 SSE_COPY_PS(XMM3,XMM6) 2020 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2021 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2022 SSE_SUB_PS(XMM7,XMM3) 2023 SSE_INLINE_END_2 2024 v += 16; 2025 } 2026 v = aa + ai16; 2027 ai16 = 16*diag[--i]; 2028 PREFETCH_NTA(aa+ai16+16); 2029 /* 2030 Scale the result by the diagonal 4x4 block, 2031 which was inverted as part of the factorization 2032 */ 2033 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2034 /* First Column */ 2035 SSE_COPY_PS(XMM0,XMM7) 2036 SSE_SHUFFLE(XMM0,XMM0,0x00) 2037 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2038 2039 /* Second Column */ 2040 SSE_COPY_PS(XMM1,XMM7) 2041 SSE_SHUFFLE(XMM1,XMM1,0x55) 2042 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2043 SSE_ADD_PS(XMM0,XMM1) 2044 2045 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2046 2047 /* Third Column */ 2048 SSE_COPY_PS(XMM2,XMM7) 2049 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2050 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2051 SSE_ADD_PS(XMM0,XMM2) 2052 2053 /* Fourth Column */ 2054 SSE_COPY_PS(XMM3,XMM7) 2055 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2056 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2057 SSE_ADD_PS(XMM0,XMM3) 2058 2059 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2060 SSE_INLINE_END_3 2061 2062 /* Promote solution from float to double */ 2063 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2064 2065 /* Apply reordering to t and stream into x. */ 2066 /* This way, x doesn't pollute the cache. */ 2067 /* Be careful with size: 2 doubles = 4 floats! */ 2068 idc = 4*(*c--); 2069 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2070 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2071 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2072 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2073 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2074 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2075 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2076 SSE_INLINE_END_2 2077 v = aa + ai16 + 16; 2078 idt -= 4; 2079 } 2080 2081 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2082 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2083 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2084 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2085 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 2086 SSE_SCOPE_END; 2087 PetscFunctionReturn(0); 2088 } 2089 2090 #endif 2091 2092 2093 /* 2094 Special case where the matrix was ILU(0) factored in the natural 2095 ordering. This eliminates the need for the column and row permutation. 2096 */ 2097 #undef __FUNCT__ 2098 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2099 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2100 { 2101 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2102 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2103 PetscErrorCode ierr; 2104 PetscInt *diag = a->diag; 2105 MatScalar *aa=a->a; 2106 PetscScalar *x,*b; 2107 2108 PetscFunctionBegin; 2109 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2110 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2111 2112 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2113 { 2114 static PetscScalar w[2000]; /* very BAD need to fix */ 2115 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2116 } 2117 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2118 { 2119 static PetscScalar w[2000]; /* very BAD need to fix */ 2120 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2121 } 2122 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2123 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2124 #else 2125 { 2126 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2127 MatScalar *v; 2128 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2129 2130 /* forward solve the lower triangular */ 2131 idx = 0; 2132 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2133 for (i=1; i<n; i++) { 2134 v = aa + 16*ai[i]; 2135 vi = aj + ai[i]; 2136 nz = diag[i] - ai[i]; 2137 idx += 4; 2138 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2139 while (nz--) { 2140 jdx = 4*(*vi++); 2141 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2142 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2143 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2144 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2145 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2146 v += 16; 2147 } 2148 x[idx] = s1; 2149 x[1+idx] = s2; 2150 x[2+idx] = s3; 2151 x[3+idx] = s4; 2152 } 2153 /* backward solve the upper triangular */ 2154 idt = 4*(n-1); 2155 for (i=n-1; i>=0; i--){ 2156 ai16 = 16*diag[i]; 2157 v = aa + ai16 + 16; 2158 vi = aj + diag[i] + 1; 2159 nz = ai[i+1] - diag[i] - 1; 2160 s1 = x[idt]; s2 = x[1+idt]; 2161 s3 = x[2+idt];s4 = x[3+idt]; 2162 while (nz--) { 2163 idx = 4*(*vi++); 2164 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2165 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2166 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2167 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2168 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2169 v += 16; 2170 } 2171 v = aa + ai16; 2172 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2173 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2174 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2175 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2176 idt -= 4; 2177 } 2178 } 2179 #endif 2180 2181 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2182 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2183 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 2184 PetscFunctionReturn(0); 2185 } 2186 2187 #undef __FUNCT__ 2188 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2189 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2190 { 2191 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2192 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2193 PetscErrorCode ierr; 2194 PetscInt *diag = a->diag; 2195 MatScalar *aa=a->a; 2196 PetscScalar *x,*b; 2197 2198 PetscFunctionBegin; 2199 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2200 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2201 2202 { 2203 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2204 MatScalar *v,*t=(MatScalar *)x; 2205 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2206 2207 /* forward solve the lower triangular */ 2208 idx = 0; 2209 t[0] = (MatScalar)b[0]; 2210 t[1] = (MatScalar)b[1]; 2211 t[2] = (MatScalar)b[2]; 2212 t[3] = (MatScalar)b[3]; 2213 for (i=1; i<n; i++) { 2214 v = aa + 16*ai[i]; 2215 vi = aj + ai[i]; 2216 nz = diag[i] - ai[i]; 2217 idx += 4; 2218 s1 = (MatScalar)b[idx]; 2219 s2 = (MatScalar)b[1+idx]; 2220 s3 = (MatScalar)b[2+idx]; 2221 s4 = (MatScalar)b[3+idx]; 2222 while (nz--) { 2223 jdx = 4*(*vi++); 2224 x1 = t[jdx]; 2225 x2 = t[1+jdx]; 2226 x3 = t[2+jdx]; 2227 x4 = t[3+jdx]; 2228 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2229 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2230 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2231 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2232 v += 16; 2233 } 2234 t[idx] = s1; 2235 t[1+idx] = s2; 2236 t[2+idx] = s3; 2237 t[3+idx] = s4; 2238 } 2239 /* backward solve the upper triangular */ 2240 idt = 4*(n-1); 2241 for (i=n-1; i>=0; i--){ 2242 ai16 = 16*diag[i]; 2243 v = aa + ai16 + 16; 2244 vi = aj + diag[i] + 1; 2245 nz = ai[i+1] - diag[i] - 1; 2246 s1 = t[idt]; 2247 s2 = t[1+idt]; 2248 s3 = t[2+idt]; 2249 s4 = t[3+idt]; 2250 while (nz--) { 2251 idx = 4*(*vi++); 2252 x1 = (MatScalar)x[idx]; 2253 x2 = (MatScalar)x[1+idx]; 2254 x3 = (MatScalar)x[2+idx]; 2255 x4 = (MatScalar)x[3+idx]; 2256 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2257 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2258 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2259 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2260 v += 16; 2261 } 2262 v = aa + ai16; 2263 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2264 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2265 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2266 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2267 idt -= 4; 2268 } 2269 } 2270 2271 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2272 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2273 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 2274 PetscFunctionReturn(0); 2275 } 2276 2277 #if defined (PETSC_HAVE_SSE) 2278 2279 #include PETSC_HAVE_SSE 2280 #undef __FUNCT__ 2281 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2282 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2283 { 2284 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2285 unsigned short *aj=(unsigned short *)a->j; 2286 PetscErrorCode ierr; 2287 int *ai=a->i,n=a->mbs,*diag = a->diag; 2288 MatScalar *aa=a->a; 2289 PetscScalar *x,*b; 2290 2291 PetscFunctionBegin; 2292 SSE_SCOPE_BEGIN; 2293 /* 2294 Note: This code currently uses demotion of double 2295 to float when performing the mixed-mode computation. 2296 This may not be numerically reasonable for all applications. 2297 */ 2298 PREFETCH_NTA(aa+16*ai[1]); 2299 2300 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2301 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2302 { 2303 /* x will first be computed in single precision then promoted inplace to double */ 2304 MatScalar *v,*t=(MatScalar *)x; 2305 int nz,i,idt,ai16; 2306 unsigned int jdx,idx; 2307 unsigned short *vi; 2308 /* Forward solve the lower triangular factor. */ 2309 2310 /* First block is the identity. */ 2311 idx = 0; 2312 CONVERT_DOUBLE4_FLOAT4(t,b); 2313 v = aa + 16*((unsigned int)ai[1]); 2314 2315 for (i=1; i<n;) { 2316 PREFETCH_NTA(&v[8]); 2317 vi = aj + ai[i]; 2318 nz = diag[i] - ai[i]; 2319 idx += 4; 2320 2321 /* Demote RHS from double to float. */ 2322 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2323 LOAD_PS(&t[idx],XMM7); 2324 2325 while (nz--) { 2326 PREFETCH_NTA(&v[16]); 2327 jdx = 4*((unsigned int)(*vi++)); 2328 2329 /* 4x4 Matrix-Vector product with negative accumulation: */ 2330 SSE_INLINE_BEGIN_2(&t[jdx],v) 2331 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2332 2333 /* First Column */ 2334 SSE_COPY_PS(XMM0,XMM6) 2335 SSE_SHUFFLE(XMM0,XMM0,0x00) 2336 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2337 SSE_SUB_PS(XMM7,XMM0) 2338 2339 /* Second Column */ 2340 SSE_COPY_PS(XMM1,XMM6) 2341 SSE_SHUFFLE(XMM1,XMM1,0x55) 2342 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2343 SSE_SUB_PS(XMM7,XMM1) 2344 2345 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2346 2347 /* Third Column */ 2348 SSE_COPY_PS(XMM2,XMM6) 2349 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2350 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2351 SSE_SUB_PS(XMM7,XMM2) 2352 2353 /* Fourth Column */ 2354 SSE_COPY_PS(XMM3,XMM6) 2355 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2356 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2357 SSE_SUB_PS(XMM7,XMM3) 2358 SSE_INLINE_END_2 2359 2360 v += 16; 2361 } 2362 v = aa + 16*ai[++i]; 2363 PREFETCH_NTA(v); 2364 STORE_PS(&t[idx],XMM7); 2365 } 2366 2367 /* Backward solve the upper triangular factor.*/ 2368 2369 idt = 4*(n-1); 2370 ai16 = 16*diag[n-1]; 2371 v = aa + ai16 + 16; 2372 for (i=n-1; i>=0;){ 2373 PREFETCH_NTA(&v[8]); 2374 vi = aj + diag[i] + 1; 2375 nz = ai[i+1] - diag[i] - 1; 2376 2377 LOAD_PS(&t[idt],XMM7); 2378 2379 while (nz--) { 2380 PREFETCH_NTA(&v[16]); 2381 idx = 4*((unsigned int)(*vi++)); 2382 2383 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2384 SSE_INLINE_BEGIN_2(&t[idx],v) 2385 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2386 2387 /* First Column */ 2388 SSE_COPY_PS(XMM0,XMM6) 2389 SSE_SHUFFLE(XMM0,XMM0,0x00) 2390 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2391 SSE_SUB_PS(XMM7,XMM0) 2392 2393 /* Second Column */ 2394 SSE_COPY_PS(XMM1,XMM6) 2395 SSE_SHUFFLE(XMM1,XMM1,0x55) 2396 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2397 SSE_SUB_PS(XMM7,XMM1) 2398 2399 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2400 2401 /* Third Column */ 2402 SSE_COPY_PS(XMM2,XMM6) 2403 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2404 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2405 SSE_SUB_PS(XMM7,XMM2) 2406 2407 /* Fourth Column */ 2408 SSE_COPY_PS(XMM3,XMM6) 2409 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2410 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2411 SSE_SUB_PS(XMM7,XMM3) 2412 SSE_INLINE_END_2 2413 v += 16; 2414 } 2415 v = aa + ai16; 2416 ai16 = 16*diag[--i]; 2417 PREFETCH_NTA(aa+ai16+16); 2418 /* 2419 Scale the result by the diagonal 4x4 block, 2420 which was inverted as part of the factorization 2421 */ 2422 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2423 /* First Column */ 2424 SSE_COPY_PS(XMM0,XMM7) 2425 SSE_SHUFFLE(XMM0,XMM0,0x00) 2426 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2427 2428 /* Second Column */ 2429 SSE_COPY_PS(XMM1,XMM7) 2430 SSE_SHUFFLE(XMM1,XMM1,0x55) 2431 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2432 SSE_ADD_PS(XMM0,XMM1) 2433 2434 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2435 2436 /* Third Column */ 2437 SSE_COPY_PS(XMM2,XMM7) 2438 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2439 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2440 SSE_ADD_PS(XMM0,XMM2) 2441 2442 /* Fourth Column */ 2443 SSE_COPY_PS(XMM3,XMM7) 2444 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2445 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2446 SSE_ADD_PS(XMM0,XMM3) 2447 2448 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2449 SSE_INLINE_END_3 2450 2451 v = aa + ai16 + 16; 2452 idt -= 4; 2453 } 2454 2455 /* Convert t from single precision back to double precision (inplace)*/ 2456 idt = 4*(n-1); 2457 for (i=n-1;i>=0;i--) { 2458 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2459 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2460 PetscScalar *xtemp=&x[idt]; 2461 MatScalar *ttemp=&t[idt]; 2462 xtemp[3] = (PetscScalar)ttemp[3]; 2463 xtemp[2] = (PetscScalar)ttemp[2]; 2464 xtemp[1] = (PetscScalar)ttemp[1]; 2465 xtemp[0] = (PetscScalar)ttemp[0]; 2466 idt -= 4; 2467 } 2468 2469 } /* End of artificial scope. */ 2470 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2471 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2472 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 2473 SSE_SCOPE_END; 2474 PetscFunctionReturn(0); 2475 } 2476 2477 #undef __FUNCT__ 2478 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2479 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2480 { 2481 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2482 int *aj=a->j; 2483 PetscErrorCode ierr; 2484 int *ai=a->i,n=a->mbs,*diag = a->diag; 2485 MatScalar *aa=a->a; 2486 PetscScalar *x,*b; 2487 2488 PetscFunctionBegin; 2489 SSE_SCOPE_BEGIN; 2490 /* 2491 Note: This code currently uses demotion of double 2492 to float when performing the mixed-mode computation. 2493 This may not be numerically reasonable for all applications. 2494 */ 2495 PREFETCH_NTA(aa+16*ai[1]); 2496 2497 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2498 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2499 { 2500 /* x will first be computed in single precision then promoted inplace to double */ 2501 MatScalar *v,*t=(MatScalar *)x; 2502 int nz,i,idt,ai16; 2503 int jdx,idx; 2504 int *vi; 2505 /* Forward solve the lower triangular factor. */ 2506 2507 /* First block is the identity. */ 2508 idx = 0; 2509 CONVERT_DOUBLE4_FLOAT4(t,b); 2510 v = aa + 16*ai[1]; 2511 2512 for (i=1; i<n;) { 2513 PREFETCH_NTA(&v[8]); 2514 vi = aj + ai[i]; 2515 nz = diag[i] - ai[i]; 2516 idx += 4; 2517 2518 /* Demote RHS from double to float. */ 2519 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2520 LOAD_PS(&t[idx],XMM7); 2521 2522 while (nz--) { 2523 PREFETCH_NTA(&v[16]); 2524 jdx = 4*(*vi++); 2525 /* jdx = *vi++; */ 2526 2527 /* 4x4 Matrix-Vector product with negative accumulation: */ 2528 SSE_INLINE_BEGIN_2(&t[jdx],v) 2529 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2530 2531 /* First Column */ 2532 SSE_COPY_PS(XMM0,XMM6) 2533 SSE_SHUFFLE(XMM0,XMM0,0x00) 2534 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2535 SSE_SUB_PS(XMM7,XMM0) 2536 2537 /* Second Column */ 2538 SSE_COPY_PS(XMM1,XMM6) 2539 SSE_SHUFFLE(XMM1,XMM1,0x55) 2540 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2541 SSE_SUB_PS(XMM7,XMM1) 2542 2543 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2544 2545 /* Third Column */ 2546 SSE_COPY_PS(XMM2,XMM6) 2547 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2548 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2549 SSE_SUB_PS(XMM7,XMM2) 2550 2551 /* Fourth Column */ 2552 SSE_COPY_PS(XMM3,XMM6) 2553 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2554 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2555 SSE_SUB_PS(XMM7,XMM3) 2556 SSE_INLINE_END_2 2557 2558 v += 16; 2559 } 2560 v = aa + 16*ai[++i]; 2561 PREFETCH_NTA(v); 2562 STORE_PS(&t[idx],XMM7); 2563 } 2564 2565 /* Backward solve the upper triangular factor.*/ 2566 2567 idt = 4*(n-1); 2568 ai16 = 16*diag[n-1]; 2569 v = aa + ai16 + 16; 2570 for (i=n-1; i>=0;){ 2571 PREFETCH_NTA(&v[8]); 2572 vi = aj + diag[i] + 1; 2573 nz = ai[i+1] - diag[i] - 1; 2574 2575 LOAD_PS(&t[idt],XMM7); 2576 2577 while (nz--) { 2578 PREFETCH_NTA(&v[16]); 2579 idx = 4*(*vi++); 2580 /* idx = *vi++; */ 2581 2582 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2583 SSE_INLINE_BEGIN_2(&t[idx],v) 2584 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2585 2586 /* First Column */ 2587 SSE_COPY_PS(XMM0,XMM6) 2588 SSE_SHUFFLE(XMM0,XMM0,0x00) 2589 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2590 SSE_SUB_PS(XMM7,XMM0) 2591 2592 /* Second Column */ 2593 SSE_COPY_PS(XMM1,XMM6) 2594 SSE_SHUFFLE(XMM1,XMM1,0x55) 2595 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2596 SSE_SUB_PS(XMM7,XMM1) 2597 2598 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2599 2600 /* Third Column */ 2601 SSE_COPY_PS(XMM2,XMM6) 2602 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2603 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2604 SSE_SUB_PS(XMM7,XMM2) 2605 2606 /* Fourth Column */ 2607 SSE_COPY_PS(XMM3,XMM6) 2608 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2609 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2610 SSE_SUB_PS(XMM7,XMM3) 2611 SSE_INLINE_END_2 2612 v += 16; 2613 } 2614 v = aa + ai16; 2615 ai16 = 16*diag[--i]; 2616 PREFETCH_NTA(aa+ai16+16); 2617 /* 2618 Scale the result by the diagonal 4x4 block, 2619 which was inverted as part of the factorization 2620 */ 2621 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2622 /* First Column */ 2623 SSE_COPY_PS(XMM0,XMM7) 2624 SSE_SHUFFLE(XMM0,XMM0,0x00) 2625 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2626 2627 /* Second Column */ 2628 SSE_COPY_PS(XMM1,XMM7) 2629 SSE_SHUFFLE(XMM1,XMM1,0x55) 2630 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2631 SSE_ADD_PS(XMM0,XMM1) 2632 2633 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2634 2635 /* Third Column */ 2636 SSE_COPY_PS(XMM2,XMM7) 2637 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2638 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2639 SSE_ADD_PS(XMM0,XMM2) 2640 2641 /* Fourth Column */ 2642 SSE_COPY_PS(XMM3,XMM7) 2643 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2644 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2645 SSE_ADD_PS(XMM0,XMM3) 2646 2647 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2648 SSE_INLINE_END_3 2649 2650 v = aa + ai16 + 16; 2651 idt -= 4; 2652 } 2653 2654 /* Convert t from single precision back to double precision (inplace)*/ 2655 idt = 4*(n-1); 2656 for (i=n-1;i>=0;i--) { 2657 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2658 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2659 PetscScalar *xtemp=&x[idt]; 2660 MatScalar *ttemp=&t[idt]; 2661 xtemp[3] = (PetscScalar)ttemp[3]; 2662 xtemp[2] = (PetscScalar)ttemp[2]; 2663 xtemp[1] = (PetscScalar)ttemp[1]; 2664 xtemp[0] = (PetscScalar)ttemp[0]; 2665 idt -= 4; 2666 } 2667 2668 } /* End of artificial scope. */ 2669 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2670 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2671 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->n);CHKERRQ(ierr); 2672 SSE_SCOPE_END; 2673 PetscFunctionReturn(0); 2674 } 2675 2676 #endif 2677 #undef __FUNCT__ 2678 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2679 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2680 { 2681 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2682 IS iscol=a->col,isrow=a->row; 2683 PetscErrorCode ierr; 2684 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2685 PetscInt *diag = a->diag; 2686 MatScalar *aa=a->a,*v; 2687 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2688 2689 PetscFunctionBegin; 2690 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2691 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2692 t = a->solve_work; 2693 2694 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2695 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2696 2697 /* forward solve the lower triangular */ 2698 idx = 3*(*r++); 2699 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2700 for (i=1; i<n; i++) { 2701 v = aa + 9*ai[i]; 2702 vi = aj + ai[i]; 2703 nz = diag[i] - ai[i]; 2704 idx = 3*(*r++); 2705 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2706 while (nz--) { 2707 idx = 3*(*vi++); 2708 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2709 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2710 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2711 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2712 v += 9; 2713 } 2714 idx = 3*i; 2715 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2716 } 2717 /* backward solve the upper triangular */ 2718 for (i=n-1; i>=0; i--){ 2719 v = aa + 9*diag[i] + 9; 2720 vi = aj + diag[i] + 1; 2721 nz = ai[i+1] - diag[i] - 1; 2722 idt = 3*i; 2723 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2724 while (nz--) { 2725 idx = 3*(*vi++); 2726 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2727 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2728 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2729 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2730 v += 9; 2731 } 2732 idc = 3*(*c--); 2733 v = aa + 9*diag[i]; 2734 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2735 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2736 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2737 } 2738 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2739 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2740 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2741 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2742 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr); 2743 PetscFunctionReturn(0); 2744 } 2745 2746 /* 2747 Special case where the matrix was ILU(0) factored in the natural 2748 ordering. This eliminates the need for the column and row permutation. 2749 */ 2750 #undef __FUNCT__ 2751 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2752 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2753 { 2754 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2755 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2756 PetscErrorCode ierr; 2757 PetscInt *diag = a->diag; 2758 MatScalar *aa=a->a,*v; 2759 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2760 PetscInt jdx,idt,idx,nz,*vi,i; 2761 2762 PetscFunctionBegin; 2763 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2764 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2765 2766 2767 /* forward solve the lower triangular */ 2768 idx = 0; 2769 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2770 for (i=1; i<n; i++) { 2771 v = aa + 9*ai[i]; 2772 vi = aj + ai[i]; 2773 nz = diag[i] - ai[i]; 2774 idx += 3; 2775 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2776 while (nz--) { 2777 jdx = 3*(*vi++); 2778 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2779 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2780 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2781 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2782 v += 9; 2783 } 2784 x[idx] = s1; 2785 x[1+idx] = s2; 2786 x[2+idx] = s3; 2787 } 2788 /* backward solve the upper triangular */ 2789 for (i=n-1; i>=0; i--){ 2790 v = aa + 9*diag[i] + 9; 2791 vi = aj + diag[i] + 1; 2792 nz = ai[i+1] - diag[i] - 1; 2793 idt = 3*i; 2794 s1 = x[idt]; s2 = x[1+idt]; 2795 s3 = x[2+idt]; 2796 while (nz--) { 2797 idx = 3*(*vi++); 2798 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2799 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2800 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2801 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2802 v += 9; 2803 } 2804 v = aa + 9*diag[i]; 2805 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2806 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2807 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2808 } 2809 2810 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2811 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2812 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->n);CHKERRQ(ierr); 2813 PetscFunctionReturn(0); 2814 } 2815 2816 #undef __FUNCT__ 2817 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2818 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2819 { 2820 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2821 IS iscol=a->col,isrow=a->row; 2822 PetscErrorCode ierr; 2823 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2824 PetscInt *diag = a->diag; 2825 MatScalar *aa=a->a,*v; 2826 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2827 2828 PetscFunctionBegin; 2829 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2830 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2831 t = a->solve_work; 2832 2833 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2834 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2835 2836 /* forward solve the lower triangular */ 2837 idx = 2*(*r++); 2838 t[0] = b[idx]; t[1] = b[1+idx]; 2839 for (i=1; i<n; i++) { 2840 v = aa + 4*ai[i]; 2841 vi = aj + ai[i]; 2842 nz = diag[i] - ai[i]; 2843 idx = 2*(*r++); 2844 s1 = b[idx]; s2 = b[1+idx]; 2845 while (nz--) { 2846 idx = 2*(*vi++); 2847 x1 = t[idx]; x2 = t[1+idx]; 2848 s1 -= v[0]*x1 + v[2]*x2; 2849 s2 -= v[1]*x1 + v[3]*x2; 2850 v += 4; 2851 } 2852 idx = 2*i; 2853 t[idx] = s1; t[1+idx] = s2; 2854 } 2855 /* backward solve the upper triangular */ 2856 for (i=n-1; i>=0; i--){ 2857 v = aa + 4*diag[i] + 4; 2858 vi = aj + diag[i] + 1; 2859 nz = ai[i+1] - diag[i] - 1; 2860 idt = 2*i; 2861 s1 = t[idt]; s2 = t[1+idt]; 2862 while (nz--) { 2863 idx = 2*(*vi++); 2864 x1 = t[idx]; x2 = t[1+idx]; 2865 s1 -= v[0]*x1 + v[2]*x2; 2866 s2 -= v[1]*x1 + v[3]*x2; 2867 v += 4; 2868 } 2869 idc = 2*(*c--); 2870 v = aa + 4*diag[i]; 2871 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2872 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2873 } 2874 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2875 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2876 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2878 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr); 2879 PetscFunctionReturn(0); 2880 } 2881 2882 /* 2883 Special case where the matrix was ILU(0) factored in the natural 2884 ordering. This eliminates the need for the column and row permutation. 2885 */ 2886 #undef __FUNCT__ 2887 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2888 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2889 { 2890 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2891 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2892 PetscErrorCode ierr; 2893 PetscInt *diag = a->diag; 2894 MatScalar *aa=a->a,*v; 2895 PetscScalar *x,*b,s1,s2,x1,x2; 2896 PetscInt jdx,idt,idx,nz,*vi,i; 2897 2898 PetscFunctionBegin; 2899 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2900 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2901 2902 /* forward solve the lower triangular */ 2903 idx = 0; 2904 x[0] = b[0]; x[1] = b[1]; 2905 for (i=1; i<n; i++) { 2906 v = aa + 4*ai[i]; 2907 vi = aj + ai[i]; 2908 nz = diag[i] - ai[i]; 2909 idx += 2; 2910 s1 = b[idx];s2 = b[1+idx]; 2911 while (nz--) { 2912 jdx = 2*(*vi++); 2913 x1 = x[jdx];x2 = x[1+jdx]; 2914 s1 -= v[0]*x1 + v[2]*x2; 2915 s2 -= v[1]*x1 + v[3]*x2; 2916 v += 4; 2917 } 2918 x[idx] = s1; 2919 x[1+idx] = s2; 2920 } 2921 /* backward solve the upper triangular */ 2922 for (i=n-1; i>=0; i--){ 2923 v = aa + 4*diag[i] + 4; 2924 vi = aj + diag[i] + 1; 2925 nz = ai[i+1] - diag[i] - 1; 2926 idt = 2*i; 2927 s1 = x[idt]; s2 = x[1+idt]; 2928 while (nz--) { 2929 idx = 2*(*vi++); 2930 x1 = x[idx]; x2 = x[1+idx]; 2931 s1 -= v[0]*x1 + v[2]*x2; 2932 s2 -= v[1]*x1 + v[3]*x2; 2933 v += 4; 2934 } 2935 v = aa + 4*diag[i]; 2936 x[idt] = v[0]*s1 + v[2]*s2; 2937 x[1+idt] = v[1]*s1 + v[3]*s2; 2938 } 2939 2940 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2941 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2942 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->n);CHKERRQ(ierr); 2943 PetscFunctionReturn(0); 2944 } 2945 2946 #undef __FUNCT__ 2947 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2948 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2949 { 2950 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2951 IS iscol=a->col,isrow=a->row; 2952 PetscErrorCode ierr; 2953 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2954 PetscInt *diag = a->diag; 2955 MatScalar *aa=a->a,*v; 2956 PetscScalar *x,*b,s1,*t; 2957 2958 PetscFunctionBegin; 2959 if (!n) PetscFunctionReturn(0); 2960 2961 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2962 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2963 t = a->solve_work; 2964 2965 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2966 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2967 2968 /* forward solve the lower triangular */ 2969 t[0] = b[*r++]; 2970 for (i=1; i<n; i++) { 2971 v = aa + ai[i]; 2972 vi = aj + ai[i]; 2973 nz = diag[i] - ai[i]; 2974 s1 = b[*r++]; 2975 while (nz--) { 2976 s1 -= (*v++)*t[*vi++]; 2977 } 2978 t[i] = s1; 2979 } 2980 /* backward solve the upper triangular */ 2981 for (i=n-1; i>=0; i--){ 2982 v = aa + diag[i] + 1; 2983 vi = aj + diag[i] + 1; 2984 nz = ai[i+1] - diag[i] - 1; 2985 s1 = t[i]; 2986 while (nz--) { 2987 s1 -= (*v++)*t[*vi++]; 2988 } 2989 x[*c--] = t[i] = aa[diag[i]]*s1; 2990 } 2991 2992 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2993 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2994 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2995 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2996 ierr = PetscLogFlops(2*1*(a->nz) - A->n);CHKERRQ(ierr); 2997 PetscFunctionReturn(0); 2998 } 2999 /* 3000 Special case where the matrix was ILU(0) factored in the natural 3001 ordering. This eliminates the need for the column and row permutation. 3002 */ 3003 #undef __FUNCT__ 3004 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3005 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3006 { 3007 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3008 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3009 PetscErrorCode ierr; 3010 PetscInt *diag = a->diag; 3011 MatScalar *aa=a->a; 3012 PetscScalar *x,*b; 3013 PetscScalar s1,x1; 3014 MatScalar *v; 3015 PetscInt jdx,idt,idx,nz,*vi,i; 3016 3017 PetscFunctionBegin; 3018 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3019 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3020 3021 /* forward solve the lower triangular */ 3022 idx = 0; 3023 x[0] = b[0]; 3024 for (i=1; i<n; i++) { 3025 v = aa + ai[i]; 3026 vi = aj + ai[i]; 3027 nz = diag[i] - ai[i]; 3028 idx += 1; 3029 s1 = b[idx]; 3030 while (nz--) { 3031 jdx = *vi++; 3032 x1 = x[jdx]; 3033 s1 -= v[0]*x1; 3034 v += 1; 3035 } 3036 x[idx] = s1; 3037 } 3038 /* backward solve the upper triangular */ 3039 for (i=n-1; i>=0; i--){ 3040 v = aa + diag[i] + 1; 3041 vi = aj + diag[i] + 1; 3042 nz = ai[i+1] - diag[i] - 1; 3043 idt = i; 3044 s1 = x[idt]; 3045 while (nz--) { 3046 idx = *vi++; 3047 x1 = x[idx]; 3048 s1 -= v[0]*x1; 3049 v += 1; 3050 } 3051 v = aa + diag[i]; 3052 x[idt] = v[0]*s1; 3053 } 3054 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3055 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3056 ierr = PetscLogFlops(2*(a->nz) - A->n);CHKERRQ(ierr); 3057 PetscFunctionReturn(0); 3058 } 3059 3060 /* ----------------------------------------------------------------*/ 3061 /* 3062 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3063 except that the data structure of Mat_SeqAIJ is slightly different. 3064 Not a good example of code reuse. 3065 */ 3066 EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat); 3067 3068 #undef __FUNCT__ 3069 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3070 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3071 { 3072 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3073 IS isicol; 3074 PetscErrorCode ierr; 3075 PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3076 PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3077 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3078 PetscInt incrlev,nnz,i,bs = A->bs,bs2 = a->bs2,levels,diagonal_fill; 3079 PetscTruth col_identity,row_identity; 3080 PetscReal f; 3081 3082 PetscFunctionBegin; 3083 f = info->fill; 3084 levels = (PetscInt)info->levels; 3085 diagonal_fill = (PetscInt)info->diagonal_fill; 3086 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3087 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3088 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3089 3090 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3091 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3092 (*fact)->factor = FACTOR_LU; 3093 b = (Mat_SeqBAIJ*)(*fact)->data; 3094 if (!b->diag) { 3095 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3096 } 3097 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3098 b->row = isrow; 3099 b->col = iscol; 3100 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3101 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3102 b->icol = isicol; 3103 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3104 ierr = PetscMalloc(((*fact)->m+1+(*fact)->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3105 } else { /* general case perform the symbolic factorization */ 3106 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3107 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3108 3109 /* get new row pointers */ 3110 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3111 ainew[0] = 0; 3112 /* don't know how many column pointers are needed so estimate */ 3113 jmax = (PetscInt)(f*ai[n] + 1); 3114 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3115 /* ajfill is level of fill for each fill entry */ 3116 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3117 /* fill is a linked list of nonzeros in active row */ 3118 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3119 /* im is level for each filled value */ 3120 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3121 /* dloc is location of diagonal in factor */ 3122 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3123 dloc[0] = 0; 3124 for (prow=0; prow<n; prow++) { 3125 3126 /* copy prow into linked list */ 3127 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3128 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3129 xi = aj + ai[r[prow]]; 3130 fill[n] = n; 3131 fill[prow] = -1; /* marker for diagonal entry */ 3132 while (nz--) { 3133 fm = n; 3134 idx = ic[*xi++]; 3135 do { 3136 m = fm; 3137 fm = fill[m]; 3138 } while (fm < idx); 3139 fill[m] = idx; 3140 fill[idx] = fm; 3141 im[idx] = 0; 3142 } 3143 3144 /* make sure diagonal entry is included */ 3145 if (diagonal_fill && fill[prow] == -1) { 3146 fm = n; 3147 while (fill[fm] < prow) fm = fill[fm]; 3148 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3149 fill[fm] = prow; 3150 im[prow] = 0; 3151 nzf++; 3152 dcount++; 3153 } 3154 3155 nzi = 0; 3156 row = fill[n]; 3157 while (row < prow) { 3158 incrlev = im[row] + 1; 3159 nz = dloc[row]; 3160 xi = ajnew + ainew[row] + nz + 1; 3161 flev = ajfill + ainew[row] + nz + 1; 3162 nnz = ainew[row+1] - ainew[row] - nz - 1; 3163 fm = row; 3164 while (nnz-- > 0) { 3165 idx = *xi++; 3166 if (*flev + incrlev > levels) { 3167 flev++; 3168 continue; 3169 } 3170 do { 3171 m = fm; 3172 fm = fill[m]; 3173 } while (fm < idx); 3174 if (fm != idx) { 3175 im[idx] = *flev + incrlev; 3176 fill[m] = idx; 3177 fill[idx] = fm; 3178 fm = idx; 3179 nzf++; 3180 } else { 3181 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3182 } 3183 flev++; 3184 } 3185 row = fill[row]; 3186 nzi++; 3187 } 3188 /* copy new filled row into permanent storage */ 3189 ainew[prow+1] = ainew[prow] + nzf; 3190 if (ainew[prow+1] > jmax) { 3191 3192 /* estimate how much additional space we will need */ 3193 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3194 /* just double the memory each time */ 3195 PetscInt maxadd = jmax; 3196 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3197 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3198 jmax += maxadd; 3199 3200 /* allocate a longer ajnew and ajfill */ 3201 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3202 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3203 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3204 ajnew = xi; 3205 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3206 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3207 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3208 ajfill = xi; 3209 reallocate++; /* count how many reallocations are needed */ 3210 } 3211 xi = ajnew + ainew[prow]; 3212 flev = ajfill + ainew[prow]; 3213 dloc[prow] = nzi; 3214 fm = fill[n]; 3215 while (nzf--) { 3216 *xi++ = fm; 3217 *flev++ = im[fm]; 3218 fm = fill[fm]; 3219 } 3220 /* make sure row has diagonal entry */ 3221 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3222 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3223 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3224 } 3225 } 3226 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3227 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3228 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3229 ierr = PetscFree(fill);CHKERRQ(ierr); 3230 ierr = PetscFree(im);CHKERRQ(ierr); 3231 3232 #if defined(PETSC_USE_VERBOSE) 3233 { 3234 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3235 ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %D Fill ratio:given %g needed %g\n",reallocate,f,af));CHKERRQ(ierr); 3236 ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af));CHKERRQ(ierr); 3237 ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af));CHKERRQ(ierr); 3238 ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"));CHKERRQ(ierr); 3239 if (diagonal_fill) { 3240 ierr = PetscVerboseInfo((A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %D missing diagonals\n",dcount));CHKERRQ(ierr); 3241 } 3242 } 3243 #endif 3244 3245 /* put together the new matrix */ 3246 ierr = MatCreate(A->comm,fact);CHKERRQ(ierr); 3247 ierr = MatSetSizes(*fact,bs*n,bs*n,bs*n,bs*n);CHKERRQ(ierr); 3248 ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr); 3249 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3250 ierr = PetscLogObjectParent(*fact,isicol);CHKERRQ(ierr); 3251 b = (Mat_SeqBAIJ*)(*fact)->data; 3252 b->singlemalloc = PETSC_FALSE; 3253 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3254 b->j = ajnew; 3255 b->i = ainew; 3256 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3257 b->diag = dloc; 3258 b->ilen = 0; 3259 b->imax = 0; 3260 b->row = isrow; 3261 b->col = iscol; 3262 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3263 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3264 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3265 b->icol = isicol; 3266 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3267 /* In b structure: Free imax, ilen, old a, old j. 3268 Allocate dloc, solve_work, new a, new j */ 3269 ierr = PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3270 b->maxnz = b->nz = ainew[n]; 3271 (*fact)->factor = FACTOR_LU; 3272 3273 (*fact)->info.factor_mallocs = reallocate; 3274 (*fact)->info.fill_ratio_given = f; 3275 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3276 } 3277 3278 if (row_identity && col_identity) { 3279 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3280 } 3281 PetscFunctionReturn(0); 3282 } 3283 3284 #undef __FUNCT__ 3285 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3286 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3287 { 3288 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3289 /* int i,*AJ=a->j,nz=a->nz; */ 3290 PetscFunctionBegin; 3291 /* Undo Column scaling */ 3292 /* while (nz--) { */ 3293 /* AJ[i] = AJ[i]/4; */ 3294 /* } */ 3295 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3296 A->ops->setunfactored = PETSC_NULL; 3297 PetscFunctionReturn(0); 3298 } 3299 3300 #undef __FUNCT__ 3301 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3302 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3303 { 3304 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3305 PetscInt *AJ=a->j,nz=a->nz; 3306 unsigned short *aj=(unsigned short *)AJ; 3307 PetscFunctionBegin; 3308 /* Is this really necessary? */ 3309 while (nz--) { 3310 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3311 } 3312 A->ops->setunfactored = PETSC_NULL; 3313 PetscFunctionReturn(0); 3314 } 3315 3316 #undef __FUNCT__ 3317 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3318 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3319 { 3320 PetscErrorCode ierr; 3321 /* 3322 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3323 with natural ordering 3324 */ 3325 PetscFunctionBegin; 3326 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3327 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3328 switch (inA->bs) { 3329 case 1: 3330 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3331 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"));CHKERRQ(ierr); 3332 break; 3333 case 2: 3334 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3335 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"));CHKERRQ(ierr); 3336 break; 3337 case 3: 3338 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3339 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"));CHKERRQ(ierr); 3340 break; 3341 case 4: 3342 #if defined(PETSC_USE_MAT_SINGLE) 3343 { 3344 PetscTruth sse_enabled_local; 3345 PetscErrorCode ierr; 3346 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3347 if (sse_enabled_local) { 3348 # if defined(PETSC_HAVE_SSE) 3349 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3350 if (n==(unsigned short)n) { 3351 unsigned short *aj=(unsigned short *)AJ; 3352 for (i=0;i<nz;i++) { 3353 aj[i] = (unsigned short)AJ[i]; 3354 } 3355 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3356 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3357 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"));CHKERRQ(ierr); 3358 } else { 3359 /* Scale the column indices for easier indexing in MatSolve. */ 3360 /* for (i=0;i<nz;i++) { */ 3361 /* AJ[i] = AJ[i]*4; */ 3362 /* } */ 3363 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3364 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3365 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"));CHKERRQ(ierr); 3366 } 3367 # else 3368 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3369 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3370 # endif 3371 } else { 3372 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3373 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"));CHKERRQ(ierr); 3374 } 3375 } 3376 #else 3377 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3378 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"));CHKERRQ(ierr); 3379 #endif 3380 break; 3381 case 5: 3382 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3383 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"));CHKERRQ(ierr); 3384 break; 3385 case 6: 3386 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3387 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"));CHKERRQ(ierr); 3388 break; 3389 case 7: 3390 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3391 ierr = PetscVerboseInfo((inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"));CHKERRQ(ierr); 3392 break; 3393 } 3394 PetscFunctionReturn(0); 3395 } 3396 3397 #undef __FUNCT__ 3398 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3399 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A) 3400 { 3401 /* 3402 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3403 with natural ordering 3404 */ 3405 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3406 IS row = a->row, col = a->col; 3407 PetscTruth row_identity, col_identity; 3408 PetscTruth use_natural; 3409 PetscErrorCode ierr; 3410 3411 PetscFunctionBegin; 3412 3413 use_natural = PETSC_FALSE; 3414 if (row && col) { 3415 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3416 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3417 3418 if (row_identity && col_identity) { 3419 use_natural = PETSC_TRUE; 3420 } 3421 } else { 3422 use_natural = PETSC_TRUE; 3423 } 3424 3425 switch (A->bs) { 3426 case 1: 3427 if (use_natural) { 3428 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3429 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3430 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"));CHKERRQ(ierr); 3431 ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"));CHKERRQ(ierr); 3432 } else { 3433 A->ops->solve = MatSolve_SeqBAIJ_1; 3434 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3435 } 3436 break; 3437 case 2: 3438 if (use_natural) { 3439 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3440 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3441 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"));CHKERRQ(ierr); 3442 ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"));CHKERRQ(ierr); 3443 } else { 3444 A->ops->solve = MatSolve_SeqBAIJ_2; 3445 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3446 } 3447 break; 3448 case 3: 3449 if (use_natural) { 3450 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3451 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3452 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"));CHKERRQ(ierr); 3453 ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"));CHKERRQ(ierr); 3454 } else { 3455 A->ops->solve = MatSolve_SeqBAIJ_3; 3456 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3457 } 3458 break; 3459 case 4: 3460 { 3461 PetscTruth sse_enabled_local; 3462 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3463 if (use_natural) { 3464 #if defined(PETSC_USE_MAT_SINGLE) 3465 if (sse_enabled_local) { /* Natural + Single + SSE */ 3466 # if defined(PETSC_HAVE_SSE) 3467 PetscInt n=a->mbs; 3468 if (n==(unsigned short)n) { 3469 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3470 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"));CHKERRQ(ierr); 3471 } else { 3472 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3473 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"));CHKERRQ(ierr); 3474 } 3475 # else 3476 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3477 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3478 # endif 3479 } else { /* Natural + Single */ 3480 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3481 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"));CHKERRQ(ierr); 3482 } 3483 #else 3484 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3485 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"));CHKERRQ(ierr); 3486 #endif 3487 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3488 ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"));CHKERRQ(ierr); 3489 } else { /* Arbitrary ordering */ 3490 #if defined(PETSC_USE_MAT_SINGLE) 3491 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3492 # if defined(PETSC_HAVE_SSE) 3493 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3494 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"));CHKERRQ(ierr); 3495 # else 3496 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3497 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3498 # endif 3499 } else { /* Arbitrary + Single */ 3500 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3501 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"));CHKERRQ(ierr); 3502 } 3503 #else 3504 A->ops->solve = MatSolve_SeqBAIJ_4; 3505 #endif 3506 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3507 } 3508 } 3509 break; 3510 case 5: 3511 if (use_natural) { 3512 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3513 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3514 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"));CHKERRQ(ierr); 3515 ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"));CHKERRQ(ierr); 3516 } else { 3517 A->ops->solve = MatSolve_SeqBAIJ_5; 3518 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3519 } 3520 break; 3521 case 6: 3522 if (use_natural) { 3523 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3524 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3525 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"));CHKERRQ(ierr); 3526 ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"));CHKERRQ(ierr); 3527 } else { 3528 A->ops->solve = MatSolve_SeqBAIJ_6; 3529 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3530 } 3531 break; 3532 case 7: 3533 if (use_natural) { 3534 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3535 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3536 ierr = PetscVerboseInfo((A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"));CHKERRQ(ierr); 3537 ierr = PetscVerboseInfo((A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"));CHKERRQ(ierr); 3538 } else { 3539 A->ops->solve = MatSolve_SeqBAIJ_7; 3540 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3541 } 3542 break; 3543 default: 3544 A->ops->solve = MatSolve_SeqBAIJ_N; 3545 break; 3546 } 3547 PetscFunctionReturn(0); 3548 } 3549 3550 #undef __FUNCT__ 3551 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3552 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) 3553 { 3554 PetscErrorCode ierr; 3555 3556 PetscFunctionBegin; 3557 ierr = MatSeqBAIJ_UpdateSolvers(A); 3558 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3559 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3560 } else { 3561 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3562 } 3563 PetscFunctionReturn(0); 3564 } 3565 3566 #undef __FUNCT__ 3567 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3568 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) 3569 { 3570 PetscErrorCode ierr; 3571 3572 PetscFunctionBegin; 3573 ierr = MatSeqBAIJ_UpdateSolvers(A); 3574 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3575 PetscFunctionReturn(0); 3576 } 3577 3578 3579