1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59 { 60 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61 PetscErrorCode ierr; 62 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63 PetscInt *diag = a->diag,oidx; 64 MatScalar *aa=a->a,*v; 65 PetscScalar s1,s2,x1,x2; 66 PetscScalar *x,*b; 67 68 PetscFunctionBegin; 69 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 70 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 71 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72 73 /* forward solve the U^T */ 74 idx = 0; 75 for (i=0; i<n; i++) { 76 77 v = aa + 4*diag[i]; 78 /* multiply by the inverse of the block diagonal */ 79 x1 = x[idx]; x2 = x[1+idx]; 80 s1 = v[0]*x1 + v[1]*x2; 81 s2 = v[2]*x1 + v[3]*x2; 82 v += 4; 83 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 oidx = 2*(*vi++); 88 x[oidx] -= v[0]*s1 + v[1]*s2; 89 x[oidx+1] -= v[2]*s1 + v[3]*s2; 90 v += 4; 91 } 92 x[idx] = s1;x[1+idx] = s2; 93 idx += 2; 94 } 95 /* backward solve the L^T */ 96 for (i=n-1; i>=0; i--){ 97 v = aa + 4*diag[i] - 4; 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - ai[i]; 100 idt = 2*i; 101 s1 = x[idt]; s2 = x[1+idt]; 102 while (nz--) { 103 idx = 2*(*vi--); 104 x[idx] -= v[0]*s1 + v[1]*s2; 105 x[idx+1] -= v[2]*s1 + v[3]*s2; 106 v -= 4; 107 } 108 } 109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118 { 119 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120 PetscErrorCode ierr; 121 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122 PetscInt *diag = a->diag,oidx; 123 MatScalar *aa=a->a,*v; 124 PetscScalar s1,s2,s3,x1,x2,x3; 125 PetscScalar *x,*b; 126 127 PetscFunctionBegin; 128 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 129 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 130 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131 132 /* forward solve the U^T */ 133 idx = 0; 134 for (i=0; i<n; i++) { 135 136 v = aa + 9*diag[i]; 137 /* multiply by the inverse of the block diagonal */ 138 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142 v += 9; 143 144 vi = aj + diag[i] + 1; 145 nz = ai[i+1] - diag[i] - 1; 146 while (nz--) { 147 oidx = 3*(*vi++); 148 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151 v += 9; 152 } 153 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154 idx += 3; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + 9*diag[i] - 9; 159 vi = aj + diag[i] - 1; 160 nz = diag[i] - ai[i]; 161 idt = 3*i; 162 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163 while (nz--) { 164 idx = 3*(*vi--); 165 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168 v -= 9; 169 } 170 } 171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180 { 181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182 PetscErrorCode ierr; 183 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184 PetscInt *diag = a->diag,oidx; 185 MatScalar *aa=a->a,*v; 186 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 187 PetscScalar *x,*b; 188 189 PetscFunctionBegin; 190 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 191 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193 194 /* forward solve the U^T */ 195 idx = 0; 196 for (i=0; i<n; i++) { 197 198 v = aa + 16*diag[i]; 199 /* multiply by the inverse of the block diagonal */ 200 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205 v += 16; 206 207 vi = aj + diag[i] + 1; 208 nz = ai[i+1] - diag[i] - 1; 209 while (nz--) { 210 oidx = 4*(*vi++); 211 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215 v += 16; 216 } 217 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218 idx += 4; 219 } 220 /* backward solve the L^T */ 221 for (i=n-1; i>=0; i--){ 222 v = aa + 16*diag[i] - 16; 223 vi = aj + diag[i] - 1; 224 nz = diag[i] - ai[i]; 225 idt = 4*i; 226 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi--); 229 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233 v -= 16; 234 } 235 } 236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245 { 246 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247 PetscErrorCode ierr; 248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249 PetscInt *diag = a->diag,oidx; 250 MatScalar *aa=a->a,*v; 251 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 252 PetscScalar *x,*b; 253 254 PetscFunctionBegin; 255 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258 259 /* forward solve the U^T */ 260 idx = 0; 261 for (i=0; i<n; i++) { 262 263 v = aa + 25*diag[i]; 264 /* multiply by the inverse of the block diagonal */ 265 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271 v += 25; 272 273 vi = aj + diag[i] + 1; 274 nz = ai[i+1] - diag[i] - 1; 275 while (nz--) { 276 oidx = 5*(*vi++); 277 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282 v += 25; 283 } 284 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285 idx += 5; 286 } 287 /* backward solve the L^T */ 288 for (i=n-1; i>=0; i--){ 289 v = aa + 25*diag[i] - 25; 290 vi = aj + diag[i] - 1; 291 nz = diag[i] - ai[i]; 292 idt = 5*i; 293 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294 while (nz--) { 295 idx = 5*(*vi--); 296 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301 v -= 25; 302 } 303 } 304 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 #undef __FUNCT__ 311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313 { 314 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315 PetscErrorCode ierr; 316 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317 PetscInt *diag = a->diag,oidx; 318 MatScalar *aa=a->a,*v; 319 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 320 PetscScalar *x,*b; 321 322 PetscFunctionBegin; 323 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 324 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326 327 /* forward solve the U^T */ 328 idx = 0; 329 for (i=0; i<n; i++) { 330 331 v = aa + 36*diag[i]; 332 /* multiply by the inverse of the block diagonal */ 333 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334 x6 = x[5+idx]; 335 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341 v += 36; 342 343 vi = aj + diag[i] + 1; 344 nz = ai[i+1] - diag[i] - 1; 345 while (nz--) { 346 oidx = 6*(*vi++); 347 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353 v += 36; 354 } 355 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356 x[5+idx] = s6; 357 idx += 6; 358 } 359 /* backward solve the L^T */ 360 for (i=n-1; i>=0; i--){ 361 v = aa + 36*diag[i] - 36; 362 vi = aj + diag[i] - 1; 363 nz = diag[i] - ai[i]; 364 idt = 6*i; 365 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366 s6 = x[5+idt]; 367 while (nz--) { 368 idx = 6*(*vi--); 369 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375 v -= 36; 376 } 377 } 378 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389 PetscErrorCode ierr; 390 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391 PetscInt *diag = a->diag,oidx; 392 MatScalar *aa=a->a,*v; 393 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 394 PetscScalar *x,*b; 395 396 PetscFunctionBegin; 397 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400 401 /* forward solve the U^T */ 402 idx = 0; 403 for (i=0; i<n; i++) { 404 405 v = aa + 49*diag[i]; 406 /* multiply by the inverse of the block diagonal */ 407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408 x6 = x[5+idx]; x7 = x[6+idx]; 409 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416 v += 49; 417 418 vi = aj + diag[i] + 1; 419 nz = ai[i+1] - diag[i] - 1; 420 while (nz--) { 421 oidx = 7*(*vi++); 422 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429 v += 49; 430 } 431 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432 x[5+idx] = s6;x[6+idx] = s7; 433 idx += 7; 434 } 435 /* backward solve the L^T */ 436 for (i=n-1; i>=0; i--){ 437 v = aa + 49*diag[i] - 49; 438 vi = aj + diag[i] - 1; 439 nz = diag[i] - ai[i]; 440 idt = 7*i; 441 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442 s6 = x[5+idt];s7 = x[6+idt]; 443 while (nz--) { 444 idx = 7*(*vi--); 445 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452 v -= 49; 453 } 454 } 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 /*---------------------------------------------------------------------------------------------*/ 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467 IS iscol=a->col,isrow=a->row; 468 PetscErrorCode ierr; 469 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 470 PetscInt *diag = a->diag; 471 MatScalar *aa=a->a,*v; 472 PetscScalar s1,*x,*b,*t; 473 474 PetscFunctionBegin; 475 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477 t = a->solve_work; 478 479 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 480 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 481 482 /* copy the b into temp work space according to permutation */ 483 for (i=0; i<n; i++) { 484 t[i] = b[c[i]]; 485 } 486 487 /* forward solve the U^T */ 488 for (i=0; i<n; i++) { 489 490 v = aa + diag[i]; 491 /* multiply by the inverse of the block diagonal */ 492 s1 = (*v++)*t[i]; 493 vi = aj + diag[i] + 1; 494 nz = ai[i+1] - diag[i] - 1; 495 while (nz--) { 496 t[*vi++] -= (*v++)*s1; 497 } 498 t[i] = s1; 499 } 500 /* backward solve the L^T */ 501 for (i=n-1; i>=0; i--){ 502 v = aa + diag[i] - 1; 503 vi = aj + diag[i] - 1; 504 nz = diag[i] - ai[i]; 505 s1 = t[i]; 506 while (nz--) { 507 t[*vi--] -= (*v--)*s1; 508 } 509 } 510 511 /* copy t into x according to permutation */ 512 for (i=0; i<n; i++) { 513 x[r[i]] = t[i]; 514 } 515 516 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 517 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 518 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 519 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 520 ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 521 PetscFunctionReturn(0); 522 } 523 524 #undef __FUNCT__ 525 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 526 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 527 { 528 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 529 IS iscol=a->col,isrow=a->row; 530 PetscErrorCode ierr; 531 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 532 PetscInt *diag = a->diag,ii,ic,ir,oidx; 533 MatScalar *aa=a->a,*v; 534 PetscScalar s1,s2,x1,x2; 535 PetscScalar *x,*b,*t; 536 537 PetscFunctionBegin; 538 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 539 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 540 t = a->solve_work; 541 542 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 543 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 544 545 /* copy the b into temp work space according to permutation */ 546 ii = 0; 547 for (i=0; i<n; i++) { 548 ic = 2*c[i]; 549 t[ii] = b[ic]; 550 t[ii+1] = b[ic+1]; 551 ii += 2; 552 } 553 554 /* forward solve the U^T */ 555 idx = 0; 556 for (i=0; i<n; i++) { 557 558 v = aa + 4*diag[i]; 559 /* multiply by the inverse of the block diagonal */ 560 x1 = t[idx]; x2 = t[1+idx]; 561 s1 = v[0]*x1 + v[1]*x2; 562 s2 = v[2]*x1 + v[3]*x2; 563 v += 4; 564 565 vi = aj + diag[i] + 1; 566 nz = ai[i+1] - diag[i] - 1; 567 while (nz--) { 568 oidx = 2*(*vi++); 569 t[oidx] -= v[0]*s1 + v[1]*s2; 570 t[oidx+1] -= v[2]*s1 + v[3]*s2; 571 v += 4; 572 } 573 t[idx] = s1;t[1+idx] = s2; 574 idx += 2; 575 } 576 /* backward solve the L^T */ 577 for (i=n-1; i>=0; i--){ 578 v = aa + 4*diag[i] - 4; 579 vi = aj + diag[i] - 1; 580 nz = diag[i] - ai[i]; 581 idt = 2*i; 582 s1 = t[idt]; s2 = t[1+idt]; 583 while (nz--) { 584 idx = 2*(*vi--); 585 t[idx] -= v[0]*s1 + v[1]*s2; 586 t[idx+1] -= v[2]*s1 + v[3]*s2; 587 v -= 4; 588 } 589 } 590 591 /* copy t into x according to permutation */ 592 ii = 0; 593 for (i=0; i<n; i++) { 594 ir = 2*r[i]; 595 x[ir] = t[ii]; 596 x[ir+1] = t[ii+1]; 597 ii += 2; 598 } 599 600 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 601 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 602 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 603 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 604 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 605 PetscFunctionReturn(0); 606 } 607 608 #undef __FUNCT__ 609 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 610 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 611 { 612 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 613 IS iscol=a->col,isrow=a->row; 614 PetscErrorCode ierr; 615 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 616 PetscInt *diag = a->diag,ii,ic,ir,oidx; 617 MatScalar *aa=a->a,*v; 618 PetscScalar s1,s2,s3,x1,x2,x3; 619 PetscScalar *x,*b,*t; 620 621 PetscFunctionBegin; 622 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 623 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 624 t = a->solve_work; 625 626 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 627 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 628 629 /* copy the b into temp work space according to permutation */ 630 ii = 0; 631 for (i=0; i<n; i++) { 632 ic = 3*c[i]; 633 t[ii] = b[ic]; 634 t[ii+1] = b[ic+1]; 635 t[ii+2] = b[ic+2]; 636 ii += 3; 637 } 638 639 /* forward solve the U^T */ 640 idx = 0; 641 for (i=0; i<n; i++) { 642 643 v = aa + 9*diag[i]; 644 /* multiply by the inverse of the block diagonal */ 645 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 646 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 647 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 648 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 649 v += 9; 650 651 vi = aj + diag[i] + 1; 652 nz = ai[i+1] - diag[i] - 1; 653 while (nz--) { 654 oidx = 3*(*vi++); 655 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658 v += 9; 659 } 660 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 661 idx += 3; 662 } 663 /* backward solve the L^T */ 664 for (i=n-1; i>=0; i--){ 665 v = aa + 9*diag[i] - 9; 666 vi = aj + diag[i] - 1; 667 nz = diag[i] - ai[i]; 668 idt = 3*i; 669 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 670 while (nz--) { 671 idx = 3*(*vi--); 672 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 673 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 674 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 675 v -= 9; 676 } 677 } 678 679 /* copy t into x according to permutation */ 680 ii = 0; 681 for (i=0; i<n; i++) { 682 ir = 3*r[i]; 683 x[ir] = t[ii]; 684 x[ir+1] = t[ii+1]; 685 x[ir+2] = t[ii+2]; 686 ii += 3; 687 } 688 689 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 690 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 691 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 692 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 693 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 694 PetscFunctionReturn(0); 695 } 696 697 #undef __FUNCT__ 698 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 699 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 700 { 701 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 702 IS iscol=a->col,isrow=a->row; 703 PetscErrorCode ierr; 704 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 705 PetscInt *diag = a->diag,ii,ic,ir,oidx; 706 MatScalar *aa=a->a,*v; 707 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 708 PetscScalar *x,*b,*t; 709 710 PetscFunctionBegin; 711 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 712 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 713 t = a->solve_work; 714 715 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 716 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 717 718 /* copy the b into temp work space according to permutation */ 719 ii = 0; 720 for (i=0; i<n; i++) { 721 ic = 4*c[i]; 722 t[ii] = b[ic]; 723 t[ii+1] = b[ic+1]; 724 t[ii+2] = b[ic+2]; 725 t[ii+3] = b[ic+3]; 726 ii += 4; 727 } 728 729 /* forward solve the U^T */ 730 idx = 0; 731 for (i=0; i<n; i++) { 732 733 v = aa + 16*diag[i]; 734 /* multiply by the inverse of the block diagonal */ 735 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 736 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 737 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 738 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 739 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 740 v += 16; 741 742 vi = aj + diag[i] + 1; 743 nz = ai[i+1] - diag[i] - 1; 744 while (nz--) { 745 oidx = 4*(*vi++); 746 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750 v += 16; 751 } 752 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 753 idx += 4; 754 } 755 /* backward solve the L^T */ 756 for (i=n-1; i>=0; i--){ 757 v = aa + 16*diag[i] - 16; 758 vi = aj + diag[i] - 1; 759 nz = diag[i] - ai[i]; 760 idt = 4*i; 761 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 762 while (nz--) { 763 idx = 4*(*vi--); 764 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 765 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 766 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 767 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 768 v -= 16; 769 } 770 } 771 772 /* copy t into x according to permutation */ 773 ii = 0; 774 for (i=0; i<n; i++) { 775 ir = 4*r[i]; 776 x[ir] = t[ii]; 777 x[ir+1] = t[ii+1]; 778 x[ir+2] = t[ii+2]; 779 x[ir+3] = t[ii+3]; 780 ii += 4; 781 } 782 783 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 784 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 785 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 786 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 787 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 788 PetscFunctionReturn(0); 789 } 790 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 IS iscol=a->col,isrow=a->row; 797 PetscErrorCode ierr; 798 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 799 PetscInt *diag = a->diag,ii,ic,ir,oidx; 800 MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 802 PetscScalar *x,*b,*t; 803 804 PetscFunctionBegin; 805 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 806 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 807 t = a->solve_work; 808 809 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 810 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 811 812 /* copy the b into temp work space according to permutation */ 813 ii = 0; 814 for (i=0; i<n; i++) { 815 ic = 5*c[i]; 816 t[ii] = b[ic]; 817 t[ii+1] = b[ic+1]; 818 t[ii+2] = b[ic+2]; 819 t[ii+3] = b[ic+3]; 820 t[ii+4] = b[ic+4]; 821 ii += 5; 822 } 823 824 /* forward solve the U^T */ 825 idx = 0; 826 for (i=0; i<n; i++) { 827 828 v = aa + 25*diag[i]; 829 /* multiply by the inverse of the block diagonal */ 830 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 831 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 832 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 833 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 834 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 835 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 836 v += 25; 837 838 vi = aj + diag[i] + 1; 839 nz = ai[i+1] - diag[i] - 1; 840 while (nz--) { 841 oidx = 5*(*vi++); 842 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847 v += 25; 848 } 849 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 850 idx += 5; 851 } 852 /* backward solve the L^T */ 853 for (i=n-1; i>=0; i--){ 854 v = aa + 25*diag[i] - 25; 855 vi = aj + diag[i] - 1; 856 nz = diag[i] - ai[i]; 857 idt = 5*i; 858 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 859 while (nz--) { 860 idx = 5*(*vi--); 861 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 862 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 863 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 864 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 865 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 866 v -= 25; 867 } 868 } 869 870 /* copy t into x according to permutation */ 871 ii = 0; 872 for (i=0; i<n; i++) { 873 ir = 5*r[i]; 874 x[ir] = t[ii]; 875 x[ir+1] = t[ii+1]; 876 x[ir+2] = t[ii+2]; 877 x[ir+3] = t[ii+3]; 878 x[ir+4] = t[ii+4]; 879 ii += 5; 880 } 881 882 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 883 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 884 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 885 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 886 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 887 PetscFunctionReturn(0); 888 } 889 890 #undef __FUNCT__ 891 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 892 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 893 { 894 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 895 IS iscol=a->col,isrow=a->row; 896 PetscErrorCode ierr; 897 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 898 PetscInt *diag = a->diag,ii,ic,ir,oidx; 899 MatScalar *aa=a->a,*v; 900 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 901 PetscScalar *x,*b,*t; 902 903 PetscFunctionBegin; 904 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 905 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 906 t = a->solve_work; 907 908 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 909 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 910 911 /* copy the b into temp work space according to permutation */ 912 ii = 0; 913 for (i=0; i<n; i++) { 914 ic = 6*c[i]; 915 t[ii] = b[ic]; 916 t[ii+1] = b[ic+1]; 917 t[ii+2] = b[ic+2]; 918 t[ii+3] = b[ic+3]; 919 t[ii+4] = b[ic+4]; 920 t[ii+5] = b[ic+5]; 921 ii += 6; 922 } 923 924 /* forward solve the U^T */ 925 idx = 0; 926 for (i=0; i<n; i++) { 927 928 v = aa + 36*diag[i]; 929 /* multiply by the inverse of the block diagonal */ 930 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 931 x6 = t[5+idx]; 932 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 933 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 934 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 935 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 936 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 937 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 938 v += 36; 939 940 vi = aj + diag[i] + 1; 941 nz = ai[i+1] - diag[i] - 1; 942 while (nz--) { 943 oidx = 6*(*vi++); 944 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 945 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 946 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 947 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 948 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 949 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 950 v += 36; 951 } 952 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 953 t[5+idx] = s6; 954 idx += 6; 955 } 956 /* backward solve the L^T */ 957 for (i=n-1; i>=0; i--){ 958 v = aa + 36*diag[i] - 36; 959 vi = aj + diag[i] - 1; 960 nz = diag[i] - ai[i]; 961 idt = 6*i; 962 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 963 s6 = t[5+idt]; 964 while (nz--) { 965 idx = 6*(*vi--); 966 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 967 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 968 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 969 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 970 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 971 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 972 v -= 36; 973 } 974 } 975 976 /* copy t into x according to permutation */ 977 ii = 0; 978 for (i=0; i<n; i++) { 979 ir = 6*r[i]; 980 x[ir] = t[ii]; 981 x[ir+1] = t[ii+1]; 982 x[ir+2] = t[ii+2]; 983 x[ir+3] = t[ii+3]; 984 x[ir+4] = t[ii+4]; 985 x[ir+5] = t[ii+5]; 986 ii += 6; 987 } 988 989 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 990 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 991 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 993 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 994 PetscFunctionReturn(0); 995 } 996 997 #undef __FUNCT__ 998 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 999 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1000 { 1001 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1002 IS iscol=a->col,isrow=a->row; 1003 PetscErrorCode ierr; 1004 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 1005 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1006 MatScalar *aa=a->a,*v; 1007 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1008 PetscScalar *x,*b,*t; 1009 1010 PetscFunctionBegin; 1011 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1012 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1013 t = a->solve_work; 1014 1015 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1016 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1017 1018 /* copy the b into temp work space according to permutation */ 1019 ii = 0; 1020 for (i=0; i<n; i++) { 1021 ic = 7*c[i]; 1022 t[ii] = b[ic]; 1023 t[ii+1] = b[ic+1]; 1024 t[ii+2] = b[ic+2]; 1025 t[ii+3] = b[ic+3]; 1026 t[ii+4] = b[ic+4]; 1027 t[ii+5] = b[ic+5]; 1028 t[ii+6] = b[ic+6]; 1029 ii += 7; 1030 } 1031 1032 /* forward solve the U^T */ 1033 idx = 0; 1034 for (i=0; i<n; i++) { 1035 1036 v = aa + 49*diag[i]; 1037 /* multiply by the inverse of the block diagonal */ 1038 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1039 x6 = t[5+idx]; x7 = t[6+idx]; 1040 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1041 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1042 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1043 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1044 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1045 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1046 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1047 v += 49; 1048 1049 vi = aj + diag[i] + 1; 1050 nz = ai[i+1] - diag[i] - 1; 1051 while (nz--) { 1052 oidx = 7*(*vi++); 1053 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1054 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1055 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1056 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1057 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1058 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1059 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1060 v += 49; 1061 } 1062 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1063 t[5+idx] = s6;t[6+idx] = s7; 1064 idx += 7; 1065 } 1066 /* backward solve the L^T */ 1067 for (i=n-1; i>=0; i--){ 1068 v = aa + 49*diag[i] - 49; 1069 vi = aj + diag[i] - 1; 1070 nz = diag[i] - ai[i]; 1071 idt = 7*i; 1072 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1073 s6 = t[5+idt];s7 = t[6+idt]; 1074 while (nz--) { 1075 idx = 7*(*vi--); 1076 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1077 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1078 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1079 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1080 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1081 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1082 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1083 v -= 49; 1084 } 1085 } 1086 1087 /* copy t into x according to permutation */ 1088 ii = 0; 1089 for (i=0; i<n; i++) { 1090 ir = 7*r[i]; 1091 x[ir] = t[ii]; 1092 x[ir+1] = t[ii+1]; 1093 x[ir+2] = t[ii+2]; 1094 x[ir+3] = t[ii+3]; 1095 x[ir+4] = t[ii+4]; 1096 x[ir+5] = t[ii+5]; 1097 x[ir+6] = t[ii+6]; 1098 ii += 7; 1099 } 1100 1101 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1102 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1103 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1105 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 1106 PetscFunctionReturn(0); 1107 } 1108 1109 /* ----------------------------------------------------------- */ 1110 #undef __FUNCT__ 1111 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1112 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1113 { 1114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1115 IS iscol=a->col,isrow=a->row; 1116 PetscErrorCode ierr; 1117 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1118 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2,*rout,*cout; 1119 MatScalar *aa=a->a,*v; 1120 PetscScalar *x,*b,*s,*t,*ls; 1121 1122 PetscFunctionBegin; 1123 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1124 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1125 t = a->solve_work; 1126 1127 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1128 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1129 1130 /* forward solve the lower triangular */ 1131 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1132 for (i=1; i<n; i++) { 1133 v = aa + bs2*ai[i]; 1134 vi = aj + ai[i]; 1135 nz = a->diag[i] - ai[i]; 1136 s = t + bs*i; 1137 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1138 while (nz--) { 1139 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1140 v += bs2; 1141 } 1142 } 1143 /* backward solve the upper triangular */ 1144 ls = a->solve_work + A->cmap->n; 1145 for (i=n-1; i>=0; i--){ 1146 v = aa + bs2*(a->diag[i] + 1); 1147 vi = aj + a->diag[i] + 1; 1148 nz = ai[i+1] - a->diag[i] - 1; 1149 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1150 while (nz--) { 1151 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1152 v += bs2; 1153 } 1154 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1155 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1156 } 1157 1158 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1159 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1160 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1162 ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 #undef __FUNCT__ 1167 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1169 { 1170 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1171 IS iscol=a->col,isrow=a->row; 1172 PetscErrorCode ierr; 1173 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1174 PetscInt *diag = a->diag; 1175 MatScalar *aa=a->a,*v; 1176 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1177 PetscScalar *x,*b,*t; 1178 1179 PetscFunctionBegin; 1180 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1181 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182 t = a->solve_work; 1183 1184 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1185 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1186 1187 /* forward solve the lower triangular */ 1188 idx = 7*(*r++); 1189 t[0] = b[idx]; t[1] = b[1+idx]; 1190 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1191 t[5] = b[5+idx]; t[6] = b[6+idx]; 1192 1193 for (i=1; i<n; i++) { 1194 v = aa + 49*ai[i]; 1195 vi = aj + ai[i]; 1196 nz = diag[i] - ai[i]; 1197 idx = 7*(*r++); 1198 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1199 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1200 while (nz--) { 1201 idx = 7*(*vi++); 1202 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1203 x4 = t[3+idx];x5 = t[4+idx]; 1204 x6 = t[5+idx];x7 = t[6+idx]; 1205 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1206 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1207 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1208 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1209 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1210 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1211 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1212 v += 49; 1213 } 1214 idx = 7*i; 1215 t[idx] = s1;t[1+idx] = s2; 1216 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1217 t[5+idx] = s6;t[6+idx] = s7; 1218 } 1219 /* backward solve the upper triangular */ 1220 for (i=n-1; i>=0; i--){ 1221 v = aa + 49*diag[i] + 49; 1222 vi = aj + diag[i] + 1; 1223 nz = ai[i+1] - diag[i] - 1; 1224 idt = 7*i; 1225 s1 = t[idt]; s2 = t[1+idt]; 1226 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1227 s6 = t[5+idt];s7 = t[6+idt]; 1228 while (nz--) { 1229 idx = 7*(*vi++); 1230 x1 = t[idx]; x2 = t[1+idx]; 1231 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1232 x6 = t[5+idx]; x7 = t[6+idx]; 1233 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1234 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1235 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1236 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1237 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1238 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1239 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1240 v += 49; 1241 } 1242 idc = 7*(*c--); 1243 v = aa + 49*diag[i]; 1244 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1245 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1246 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1247 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1248 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1249 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1250 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1251 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1252 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1253 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1254 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1255 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1256 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1257 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1258 } 1259 1260 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1261 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1263 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1264 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #undef __FUNCT__ 1269 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1270 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1271 { 1272 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1273 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1274 PetscErrorCode ierr; 1275 PetscInt *diag = a->diag,jdx; 1276 const MatScalar *aa=a->a,*v; 1277 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1278 const PetscScalar *b; 1279 1280 PetscFunctionBegin; 1281 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1282 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1283 /* forward solve the lower triangular */ 1284 idx = 0; 1285 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1286 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1287 x[6] = b[6+idx]; 1288 for (i=1; i<n; i++) { 1289 v = aa + 49*ai[i]; 1290 vi = aj + ai[i]; 1291 nz = diag[i] - ai[i]; 1292 idx = 7*i; 1293 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1294 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1295 s7 = b[6+idx]; 1296 while (nz--) { 1297 jdx = 7*(*vi++); 1298 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1299 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1300 x7 = x[6+jdx]; 1301 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1302 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1303 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1304 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1305 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1306 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1307 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1308 v += 49; 1309 } 1310 x[idx] = s1; 1311 x[1+idx] = s2; 1312 x[2+idx] = s3; 1313 x[3+idx] = s4; 1314 x[4+idx] = s5; 1315 x[5+idx] = s6; 1316 x[6+idx] = s7; 1317 } 1318 /* backward solve the upper triangular */ 1319 for (i=n-1; i>=0; i--){ 1320 v = aa + 49*diag[i] + 49; 1321 vi = aj + diag[i] + 1; 1322 nz = ai[i+1] - diag[i] - 1; 1323 idt = 7*i; 1324 s1 = x[idt]; s2 = x[1+idt]; 1325 s3 = x[2+idt]; s4 = x[3+idt]; 1326 s5 = x[4+idt]; s6 = x[5+idt]; 1327 s7 = x[6+idt]; 1328 while (nz--) { 1329 idx = 7*(*vi++); 1330 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1331 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1332 x7 = x[6+idx]; 1333 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1334 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1335 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1336 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1337 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1338 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1339 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1340 v += 49; 1341 } 1342 v = aa + 49*diag[i]; 1343 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1344 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1345 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1346 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1347 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1348 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1349 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1350 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1351 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1352 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1353 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1354 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1355 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1356 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1357 } 1358 1359 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1361 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1362 PetscFunctionReturn(0); 1363 } 1364 1365 #undef __FUNCT__ 1366 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1367 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1368 { 1369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1370 IS iscol=a->col,isrow=a->row; 1371 PetscErrorCode ierr; 1372 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1373 PetscInt *diag = a->diag; 1374 const MatScalar *aa=a->a,*v; 1375 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1376 const PetscScalar *b; 1377 PetscFunctionBegin; 1378 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 t = a->solve_work; 1381 1382 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1383 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1384 1385 /* forward solve the lower triangular */ 1386 idx = 6*(*r++); 1387 t[0] = b[idx]; t[1] = b[1+idx]; 1388 t[2] = b[2+idx]; t[3] = b[3+idx]; 1389 t[4] = b[4+idx]; t[5] = b[5+idx]; 1390 for (i=1; i<n; i++) { 1391 v = aa + 36*ai[i]; 1392 vi = aj + ai[i]; 1393 nz = diag[i] - ai[i]; 1394 idx = 6*(*r++); 1395 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1396 s5 = b[4+idx]; s6 = b[5+idx]; 1397 while (nz--) { 1398 idx = 6*(*vi++); 1399 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1400 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1401 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1402 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1403 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1404 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1405 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1406 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1407 v += 36; 1408 } 1409 idx = 6*i; 1410 t[idx] = s1;t[1+idx] = s2; 1411 t[2+idx] = s3;t[3+idx] = s4; 1412 t[4+idx] = s5;t[5+idx] = s6; 1413 } 1414 /* backward solve the upper triangular */ 1415 for (i=n-1; i>=0; i--){ 1416 v = aa + 36*diag[i] + 36; 1417 vi = aj + diag[i] + 1; 1418 nz = ai[i+1] - diag[i] - 1; 1419 idt = 6*i; 1420 s1 = t[idt]; s2 = t[1+idt]; 1421 s3 = t[2+idt];s4 = t[3+idt]; 1422 s5 = t[4+idt];s6 = t[5+idt]; 1423 while (nz--) { 1424 idx = 6*(*vi++); 1425 x1 = t[idx]; x2 = t[1+idx]; 1426 x3 = t[2+idx]; x4 = t[3+idx]; 1427 x5 = t[4+idx]; x6 = t[5+idx]; 1428 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1429 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1430 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1431 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1432 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1433 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1434 v += 36; 1435 } 1436 idc = 6*(*c--); 1437 v = aa + 36*diag[i]; 1438 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1439 v[18]*s4+v[24]*s5+v[30]*s6; 1440 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1441 v[19]*s4+v[25]*s5+v[31]*s6; 1442 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1443 v[20]*s4+v[26]*s5+v[32]*s6; 1444 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1445 v[21]*s4+v[27]*s5+v[33]*s6; 1446 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1447 v[22]*s4+v[28]*s5+v[34]*s6; 1448 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1449 v[23]*s4+v[29]*s5+v[35]*s6; 1450 } 1451 1452 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1453 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1454 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1456 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1457 PetscFunctionReturn(0); 1458 } 1459 1460 #undef __FUNCT__ 1461 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1462 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1463 { 1464 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1465 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1466 PetscErrorCode ierr; 1467 PetscInt *diag = a->diag,jdx; 1468 const MatScalar *aa=a->a,*v; 1469 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1470 const PetscScalar *b; 1471 1472 PetscFunctionBegin; 1473 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1474 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1475 /* forward solve the lower triangular */ 1476 idx = 0; 1477 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1478 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1479 for (i=1; i<n; i++) { 1480 v = aa + 36*ai[i]; 1481 vi = aj + ai[i]; 1482 nz = diag[i] - ai[i]; 1483 idx = 6*i; 1484 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1485 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1486 while (nz--) { 1487 jdx = 6*(*vi++); 1488 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1489 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1490 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1491 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1492 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1493 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1494 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1495 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1496 v += 36; 1497 } 1498 x[idx] = s1; 1499 x[1+idx] = s2; 1500 x[2+idx] = s3; 1501 x[3+idx] = s4; 1502 x[4+idx] = s5; 1503 x[5+idx] = s6; 1504 } 1505 /* backward solve the upper triangular */ 1506 for (i=n-1; i>=0; i--){ 1507 v = aa + 36*diag[i] + 36; 1508 vi = aj + diag[i] + 1; 1509 nz = ai[i+1] - diag[i] - 1; 1510 idt = 6*i; 1511 s1 = x[idt]; s2 = x[1+idt]; 1512 s3 = x[2+idt]; s4 = x[3+idt]; 1513 s5 = x[4+idt]; s6 = x[5+idt]; 1514 while (nz--) { 1515 idx = 6*(*vi++); 1516 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1517 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1518 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1519 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1520 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1521 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1522 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1523 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1524 v += 36; 1525 } 1526 v = aa + 36*diag[i]; 1527 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1528 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1529 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1530 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1531 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1532 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1533 } 1534 1535 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1536 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1537 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1538 PetscFunctionReturn(0); 1539 } 1540 1541 #undef __FUNCT__ 1542 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1543 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1544 { 1545 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1546 IS iscol=a->col,isrow=a->row; 1547 PetscErrorCode ierr; 1548 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1549 PetscInt *diag = a->diag; 1550 const MatScalar *aa=a->a,*v; 1551 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1552 const PetscScalar *b; 1553 1554 PetscFunctionBegin; 1555 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1556 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1557 t = a->solve_work; 1558 1559 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1560 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1561 1562 /* forward solve the lower triangular */ 1563 idx = 5*(*r++); 1564 t[0] = b[idx]; t[1] = b[1+idx]; 1565 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1566 for (i=1; i<n; i++) { 1567 v = aa + 25*ai[i]; 1568 vi = aj + ai[i]; 1569 nz = diag[i] - ai[i]; 1570 idx = 5*(*r++); 1571 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1572 s5 = b[4+idx]; 1573 while (nz--) { 1574 idx = 5*(*vi++); 1575 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1576 x4 = t[3+idx];x5 = t[4+idx]; 1577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1582 v += 25; 1583 } 1584 idx = 5*i; 1585 t[idx] = s1;t[1+idx] = s2; 1586 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1587 } 1588 /* backward solve the upper triangular */ 1589 for (i=n-1; i>=0; i--){ 1590 v = aa + 25*diag[i] + 25; 1591 vi = aj + diag[i] + 1; 1592 nz = ai[i+1] - diag[i] - 1; 1593 idt = 5*i; 1594 s1 = t[idt]; s2 = t[1+idt]; 1595 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1596 while (nz--) { 1597 idx = 5*(*vi++); 1598 x1 = t[idx]; x2 = t[1+idx]; 1599 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1600 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1601 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1602 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1603 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1604 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1605 v += 25; 1606 } 1607 idc = 5*(*c--); 1608 v = aa + 25*diag[i]; 1609 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1610 v[15]*s4+v[20]*s5; 1611 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1612 v[16]*s4+v[21]*s5; 1613 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1614 v[17]*s4+v[22]*s5; 1615 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1616 v[18]*s4+v[23]*s5; 1617 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1618 v[19]*s4+v[24]*s5; 1619 } 1620 1621 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1622 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1623 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1624 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1625 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 1626 PetscFunctionReturn(0); 1627 } 1628 1629 #undef __FUNCT__ 1630 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1631 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1632 { 1633 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1634 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1635 PetscErrorCode ierr; 1636 PetscInt *diag = a->diag,jdx; 1637 const MatScalar *aa=a->a,*v; 1638 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1639 const PetscScalar *b; 1640 1641 PetscFunctionBegin; 1642 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1643 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1644 /* forward solve the lower triangular */ 1645 idx = 0; 1646 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1647 for (i=1; i<n; i++) { 1648 v = aa + 25*ai[i]; 1649 vi = aj + ai[i]; 1650 nz = diag[i] - ai[i]; 1651 idx = 5*i; 1652 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1653 while (nz--) { 1654 jdx = 5*(*vi++); 1655 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1656 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1657 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1658 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1659 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1660 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1661 v += 25; 1662 } 1663 x[idx] = s1; 1664 x[1+idx] = s2; 1665 x[2+idx] = s3; 1666 x[3+idx] = s4; 1667 x[4+idx] = s5; 1668 } 1669 /* backward solve the upper triangular */ 1670 for (i=n-1; i>=0; i--){ 1671 v = aa + 25*diag[i] + 25; 1672 vi = aj + diag[i] + 1; 1673 nz = ai[i+1] - diag[i] - 1; 1674 idt = 5*i; 1675 s1 = x[idt]; s2 = x[1+idt]; 1676 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1677 while (nz--) { 1678 idx = 5*(*vi++); 1679 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1680 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1681 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1682 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1683 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1684 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1685 v += 25; 1686 } 1687 v = aa + 25*diag[i]; 1688 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1689 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1690 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1691 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1692 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1693 } 1694 1695 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1696 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1697 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 1698 PetscFunctionReturn(0); 1699 } 1700 1701 #undef __FUNCT__ 1702 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1703 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1704 { 1705 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1706 IS iscol=a->col,isrow=a->row; 1707 PetscErrorCode ierr; 1708 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1709 PetscInt *diag = a->diag; 1710 const MatScalar *aa=a->a,*v; 1711 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1712 const PetscScalar *b; 1713 1714 PetscFunctionBegin; 1715 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1717 t = a->solve_work; 1718 1719 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1720 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1721 1722 /* forward solve the lower triangular */ 1723 idx = 4*(*r++); 1724 t[0] = b[idx]; t[1] = b[1+idx]; 1725 t[2] = b[2+idx]; t[3] = b[3+idx]; 1726 for (i=1; i<n; i++) { 1727 v = aa + 16*ai[i]; 1728 vi = aj + ai[i]; 1729 nz = diag[i] - ai[i]; 1730 idx = 4*(*r++); 1731 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1732 while (nz--) { 1733 idx = 4*(*vi++); 1734 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1735 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1736 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1737 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1738 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1739 v += 16; 1740 } 1741 idx = 4*i; 1742 t[idx] = s1;t[1+idx] = s2; 1743 t[2+idx] = s3;t[3+idx] = s4; 1744 } 1745 /* backward solve the upper triangular */ 1746 for (i=n-1; i>=0; i--){ 1747 v = aa + 16*diag[i] + 16; 1748 vi = aj + diag[i] + 1; 1749 nz = ai[i+1] - diag[i] - 1; 1750 idt = 4*i; 1751 s1 = t[idt]; s2 = t[1+idt]; 1752 s3 = t[2+idt];s4 = t[3+idt]; 1753 while (nz--) { 1754 idx = 4*(*vi++); 1755 x1 = t[idx]; x2 = t[1+idx]; 1756 x3 = t[2+idx]; x4 = t[3+idx]; 1757 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1758 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1759 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1760 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1761 v += 16; 1762 } 1763 idc = 4*(*c--); 1764 v = aa + 16*diag[i]; 1765 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1766 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1767 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1768 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1769 } 1770 1771 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1772 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1773 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1774 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1775 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 1776 PetscFunctionReturn(0); 1777 } 1778 1779 #undef __FUNCT__ 1780 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1781 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1782 { 1783 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1784 IS iscol=a->col,isrow=a->row; 1785 PetscErrorCode ierr; 1786 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1787 PetscInt *diag = a->diag; 1788 const MatScalar *aa=a->a,*v; 1789 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1790 PetscScalar *x; 1791 const PetscScalar *b; 1792 1793 PetscFunctionBegin; 1794 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1796 t = (MatScalar *)a->solve_work; 1797 1798 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1799 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1800 1801 /* forward solve the lower triangular */ 1802 idx = 4*(*r++); 1803 t[0] = (MatScalar)b[idx]; 1804 t[1] = (MatScalar)b[1+idx]; 1805 t[2] = (MatScalar)b[2+idx]; 1806 t[3] = (MatScalar)b[3+idx]; 1807 for (i=1; i<n; i++) { 1808 v = aa + 16*ai[i]; 1809 vi = aj + ai[i]; 1810 nz = diag[i] - ai[i]; 1811 idx = 4*(*r++); 1812 s1 = (MatScalar)b[idx]; 1813 s2 = (MatScalar)b[1+idx]; 1814 s3 = (MatScalar)b[2+idx]; 1815 s4 = (MatScalar)b[3+idx]; 1816 while (nz--) { 1817 idx = 4*(*vi++); 1818 x1 = t[idx]; 1819 x2 = t[1+idx]; 1820 x3 = t[2+idx]; 1821 x4 = t[3+idx]; 1822 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1823 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1824 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1825 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1826 v += 16; 1827 } 1828 idx = 4*i; 1829 t[idx] = s1; 1830 t[1+idx] = s2; 1831 t[2+idx] = s3; 1832 t[3+idx] = s4; 1833 } 1834 /* backward solve the upper triangular */ 1835 for (i=n-1; i>=0; i--){ 1836 v = aa + 16*diag[i] + 16; 1837 vi = aj + diag[i] + 1; 1838 nz = ai[i+1] - diag[i] - 1; 1839 idt = 4*i; 1840 s1 = t[idt]; 1841 s2 = t[1+idt]; 1842 s3 = t[2+idt]; 1843 s4 = t[3+idt]; 1844 while (nz--) { 1845 idx = 4*(*vi++); 1846 x1 = t[idx]; 1847 x2 = t[1+idx]; 1848 x3 = t[2+idx]; 1849 x4 = t[3+idx]; 1850 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1851 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1852 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1853 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1854 v += 16; 1855 } 1856 idc = 4*(*c--); 1857 v = aa + 16*diag[i]; 1858 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1859 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1860 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1861 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1862 x[idc] = (PetscScalar)t[idt]; 1863 x[1+idc] = (PetscScalar)t[1+idt]; 1864 x[2+idc] = (PetscScalar)t[2+idt]; 1865 x[3+idc] = (PetscScalar)t[3+idt]; 1866 } 1867 1868 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1869 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1870 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1871 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1872 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 1873 PetscFunctionReturn(0); 1874 } 1875 1876 #if defined (PETSC_HAVE_SSE) 1877 1878 #include PETSC_HAVE_SSE 1879 1880 #undef __FUNCT__ 1881 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1882 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1883 { 1884 /* 1885 Note: This code uses demotion of double 1886 to float when performing the mixed-mode computation. 1887 This may not be numerically reasonable for all applications. 1888 */ 1889 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1890 IS iscol=a->col,isrow=a->row; 1891 PetscErrorCode ierr; 1892 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1893 PetscInt *diag = a->diag,ai16; 1894 MatScalar *aa=a->a,*v; 1895 PetscScalar *x,*b,*t; 1896 1897 /* Make space in temp stack for 16 Byte Aligned arrays */ 1898 float ssealignedspace[11],*tmps,*tmpx; 1899 unsigned long offset; 1900 1901 PetscFunctionBegin; 1902 SSE_SCOPE_BEGIN; 1903 1904 offset = (unsigned long)ssealignedspace % 16; 1905 if (offset) offset = (16 - offset)/4; 1906 tmps = &ssealignedspace[offset]; 1907 tmpx = &ssealignedspace[offset+4]; 1908 PREFETCH_NTA(aa+16*ai[1]); 1909 1910 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1912 t = a->solve_work; 1913 1914 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1915 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1916 1917 /* forward solve the lower triangular */ 1918 idx = 4*(*r++); 1919 t[0] = b[idx]; t[1] = b[1+idx]; 1920 t[2] = b[2+idx]; t[3] = b[3+idx]; 1921 v = aa + 16*ai[1]; 1922 1923 for (i=1; i<n;) { 1924 PREFETCH_NTA(&v[8]); 1925 vi = aj + ai[i]; 1926 nz = diag[i] - ai[i]; 1927 idx = 4*(*r++); 1928 1929 /* Demote sum from double to float */ 1930 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1931 LOAD_PS(tmps,XMM7); 1932 1933 while (nz--) { 1934 PREFETCH_NTA(&v[16]); 1935 idx = 4*(*vi++); 1936 1937 /* Demote solution (so far) from double to float */ 1938 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1939 1940 /* 4x4 Matrix-Vector product with negative accumulation: */ 1941 SSE_INLINE_BEGIN_2(tmpx,v) 1942 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1943 1944 /* First Column */ 1945 SSE_COPY_PS(XMM0,XMM6) 1946 SSE_SHUFFLE(XMM0,XMM0,0x00) 1947 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1948 SSE_SUB_PS(XMM7,XMM0) 1949 1950 /* Second Column */ 1951 SSE_COPY_PS(XMM1,XMM6) 1952 SSE_SHUFFLE(XMM1,XMM1,0x55) 1953 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1954 SSE_SUB_PS(XMM7,XMM1) 1955 1956 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1957 1958 /* Third Column */ 1959 SSE_COPY_PS(XMM2,XMM6) 1960 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1961 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1962 SSE_SUB_PS(XMM7,XMM2) 1963 1964 /* Fourth Column */ 1965 SSE_COPY_PS(XMM3,XMM6) 1966 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1967 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1968 SSE_SUB_PS(XMM7,XMM3) 1969 SSE_INLINE_END_2 1970 1971 v += 16; 1972 } 1973 idx = 4*i; 1974 v = aa + 16*ai[++i]; 1975 PREFETCH_NTA(v); 1976 STORE_PS(tmps,XMM7); 1977 1978 /* Promote result from float to double */ 1979 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1980 } 1981 /* backward solve the upper triangular */ 1982 idt = 4*(n-1); 1983 ai16 = 16*diag[n-1]; 1984 v = aa + ai16 + 16; 1985 for (i=n-1; i>=0;){ 1986 PREFETCH_NTA(&v[8]); 1987 vi = aj + diag[i] + 1; 1988 nz = ai[i+1] - diag[i] - 1; 1989 1990 /* Demote accumulator from double to float */ 1991 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1992 LOAD_PS(tmps,XMM7); 1993 1994 while (nz--) { 1995 PREFETCH_NTA(&v[16]); 1996 idx = 4*(*vi++); 1997 1998 /* Demote solution (so far) from double to float */ 1999 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2000 2001 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2002 SSE_INLINE_BEGIN_2(tmpx,v) 2003 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2004 2005 /* First Column */ 2006 SSE_COPY_PS(XMM0,XMM6) 2007 SSE_SHUFFLE(XMM0,XMM0,0x00) 2008 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2009 SSE_SUB_PS(XMM7,XMM0) 2010 2011 /* Second Column */ 2012 SSE_COPY_PS(XMM1,XMM6) 2013 SSE_SHUFFLE(XMM1,XMM1,0x55) 2014 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2015 SSE_SUB_PS(XMM7,XMM1) 2016 2017 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2018 2019 /* Third Column */ 2020 SSE_COPY_PS(XMM2,XMM6) 2021 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2022 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2023 SSE_SUB_PS(XMM7,XMM2) 2024 2025 /* Fourth Column */ 2026 SSE_COPY_PS(XMM3,XMM6) 2027 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2028 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2029 SSE_SUB_PS(XMM7,XMM3) 2030 SSE_INLINE_END_2 2031 v += 16; 2032 } 2033 v = aa + ai16; 2034 ai16 = 16*diag[--i]; 2035 PREFETCH_NTA(aa+ai16+16); 2036 /* 2037 Scale the result by the diagonal 4x4 block, 2038 which was inverted as part of the factorization 2039 */ 2040 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2041 /* First Column */ 2042 SSE_COPY_PS(XMM0,XMM7) 2043 SSE_SHUFFLE(XMM0,XMM0,0x00) 2044 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2045 2046 /* Second Column */ 2047 SSE_COPY_PS(XMM1,XMM7) 2048 SSE_SHUFFLE(XMM1,XMM1,0x55) 2049 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2050 SSE_ADD_PS(XMM0,XMM1) 2051 2052 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2053 2054 /* Third Column */ 2055 SSE_COPY_PS(XMM2,XMM7) 2056 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2057 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2058 SSE_ADD_PS(XMM0,XMM2) 2059 2060 /* Fourth Column */ 2061 SSE_COPY_PS(XMM3,XMM7) 2062 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2063 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2064 SSE_ADD_PS(XMM0,XMM3) 2065 2066 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2067 SSE_INLINE_END_3 2068 2069 /* Promote solution from float to double */ 2070 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2071 2072 /* Apply reordering to t and stream into x. */ 2073 /* This way, x doesn't pollute the cache. */ 2074 /* Be careful with size: 2 doubles = 4 floats! */ 2075 idc = 4*(*c--); 2076 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2077 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2078 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2079 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2080 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2081 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2082 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2083 SSE_INLINE_END_2 2084 v = aa + ai16 + 16; 2085 idt -= 4; 2086 } 2087 2088 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2089 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2090 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2092 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2093 SSE_SCOPE_END; 2094 PetscFunctionReturn(0); 2095 } 2096 2097 #endif 2098 2099 2100 /* 2101 Special case where the matrix was ILU(0) factored in the natural 2102 ordering. This eliminates the need for the column and row permutation. 2103 */ 2104 #undef __FUNCT__ 2105 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2106 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2107 { 2108 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2109 PetscInt n=a->mbs; 2110 const PetscInt *ai=a->i,*aj=a->j; 2111 PetscErrorCode ierr; 2112 const PetscInt *diag = a->diag; 2113 const MatScalar *aa=a->a; 2114 PetscScalar *x; 2115 const PetscScalar *b; 2116 2117 PetscFunctionBegin; 2118 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2119 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2120 2121 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2122 { 2123 static PetscScalar w[2000]; /* very BAD need to fix */ 2124 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2125 } 2126 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2127 { 2128 static PetscScalar w[2000]; /* very BAD need to fix */ 2129 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2130 } 2131 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2132 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2133 #else 2134 { 2135 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2136 const MatScalar *v; 2137 PetscInt jdx,idt,idx,nz,i,ai16; 2138 const PetscInt *vi; 2139 2140 /* forward solve the lower triangular */ 2141 idx = 0; 2142 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2143 for (i=1; i<n; i++) { 2144 v = aa + 16*ai[i]; 2145 vi = aj + ai[i]; 2146 nz = diag[i] - ai[i]; 2147 idx += 4; 2148 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2149 while (nz--) { 2150 jdx = 4*(*vi++); 2151 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2152 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2153 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2154 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2155 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2156 v += 16; 2157 } 2158 x[idx] = s1; 2159 x[1+idx] = s2; 2160 x[2+idx] = s3; 2161 x[3+idx] = s4; 2162 } 2163 /* backward solve the upper triangular */ 2164 idt = 4*(n-1); 2165 for (i=n-1; i>=0; i--){ 2166 ai16 = 16*diag[i]; 2167 v = aa + ai16 + 16; 2168 vi = aj + diag[i] + 1; 2169 nz = ai[i+1] - diag[i] - 1; 2170 s1 = x[idt]; s2 = x[1+idt]; 2171 s3 = x[2+idt];s4 = x[3+idt]; 2172 while (nz--) { 2173 idx = 4*(*vi++); 2174 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2175 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2176 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2177 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2178 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2179 v += 16; 2180 } 2181 v = aa + ai16; 2182 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2183 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2184 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2185 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2186 idt -= 4; 2187 } 2188 } 2189 #endif 2190 2191 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2192 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2193 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2194 PetscFunctionReturn(0); 2195 } 2196 2197 #undef __FUNCT__ 2198 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2199 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2200 { 2201 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2202 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2203 PetscErrorCode ierr; 2204 PetscInt *diag = a->diag; 2205 MatScalar *aa=a->a; 2206 PetscScalar *x,*b; 2207 2208 PetscFunctionBegin; 2209 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2210 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2211 2212 { 2213 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2214 MatScalar *v,*t=(MatScalar *)x; 2215 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2216 2217 /* forward solve the lower triangular */ 2218 idx = 0; 2219 t[0] = (MatScalar)b[0]; 2220 t[1] = (MatScalar)b[1]; 2221 t[2] = (MatScalar)b[2]; 2222 t[3] = (MatScalar)b[3]; 2223 for (i=1; i<n; i++) { 2224 v = aa + 16*ai[i]; 2225 vi = aj + ai[i]; 2226 nz = diag[i] - ai[i]; 2227 idx += 4; 2228 s1 = (MatScalar)b[idx]; 2229 s2 = (MatScalar)b[1+idx]; 2230 s3 = (MatScalar)b[2+idx]; 2231 s4 = (MatScalar)b[3+idx]; 2232 while (nz--) { 2233 jdx = 4*(*vi++); 2234 x1 = t[jdx]; 2235 x2 = t[1+jdx]; 2236 x3 = t[2+jdx]; 2237 x4 = t[3+jdx]; 2238 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2239 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2240 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2241 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2242 v += 16; 2243 } 2244 t[idx] = s1; 2245 t[1+idx] = s2; 2246 t[2+idx] = s3; 2247 t[3+idx] = s4; 2248 } 2249 /* backward solve the upper triangular */ 2250 idt = 4*(n-1); 2251 for (i=n-1; i>=0; i--){ 2252 ai16 = 16*diag[i]; 2253 v = aa + ai16 + 16; 2254 vi = aj + diag[i] + 1; 2255 nz = ai[i+1] - diag[i] - 1; 2256 s1 = t[idt]; 2257 s2 = t[1+idt]; 2258 s3 = t[2+idt]; 2259 s4 = t[3+idt]; 2260 while (nz--) { 2261 idx = 4*(*vi++); 2262 x1 = (MatScalar)x[idx]; 2263 x2 = (MatScalar)x[1+idx]; 2264 x3 = (MatScalar)x[2+idx]; 2265 x4 = (MatScalar)x[3+idx]; 2266 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2267 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2268 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2269 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2270 v += 16; 2271 } 2272 v = aa + ai16; 2273 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2274 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2275 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2276 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2277 idt -= 4; 2278 } 2279 } 2280 2281 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2282 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2283 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2284 PetscFunctionReturn(0); 2285 } 2286 2287 #if defined (PETSC_HAVE_SSE) 2288 2289 #include PETSC_HAVE_SSE 2290 #undef __FUNCT__ 2291 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2292 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2293 { 2294 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2295 unsigned short *aj=(unsigned short *)a->j; 2296 PetscErrorCode ierr; 2297 int *ai=a->i,n=a->mbs,*diag = a->diag; 2298 MatScalar *aa=a->a; 2299 PetscScalar *x,*b; 2300 2301 PetscFunctionBegin; 2302 SSE_SCOPE_BEGIN; 2303 /* 2304 Note: This code currently uses demotion of double 2305 to float when performing the mixed-mode computation. 2306 This may not be numerically reasonable for all applications. 2307 */ 2308 PREFETCH_NTA(aa+16*ai[1]); 2309 2310 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2311 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2312 { 2313 /* x will first be computed in single precision then promoted inplace to double */ 2314 MatScalar *v,*t=(MatScalar *)x; 2315 int nz,i,idt,ai16; 2316 unsigned int jdx,idx; 2317 unsigned short *vi; 2318 /* Forward solve the lower triangular factor. */ 2319 2320 /* First block is the identity. */ 2321 idx = 0; 2322 CONVERT_DOUBLE4_FLOAT4(t,b); 2323 v = aa + 16*((unsigned int)ai[1]); 2324 2325 for (i=1; i<n;) { 2326 PREFETCH_NTA(&v[8]); 2327 vi = aj + ai[i]; 2328 nz = diag[i] - ai[i]; 2329 idx += 4; 2330 2331 /* Demote RHS from double to float. */ 2332 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2333 LOAD_PS(&t[idx],XMM7); 2334 2335 while (nz--) { 2336 PREFETCH_NTA(&v[16]); 2337 jdx = 4*((unsigned int)(*vi++)); 2338 2339 /* 4x4 Matrix-Vector product with negative accumulation: */ 2340 SSE_INLINE_BEGIN_2(&t[jdx],v) 2341 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2342 2343 /* First Column */ 2344 SSE_COPY_PS(XMM0,XMM6) 2345 SSE_SHUFFLE(XMM0,XMM0,0x00) 2346 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2347 SSE_SUB_PS(XMM7,XMM0) 2348 2349 /* Second Column */ 2350 SSE_COPY_PS(XMM1,XMM6) 2351 SSE_SHUFFLE(XMM1,XMM1,0x55) 2352 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2353 SSE_SUB_PS(XMM7,XMM1) 2354 2355 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2356 2357 /* Third Column */ 2358 SSE_COPY_PS(XMM2,XMM6) 2359 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2360 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2361 SSE_SUB_PS(XMM7,XMM2) 2362 2363 /* Fourth Column */ 2364 SSE_COPY_PS(XMM3,XMM6) 2365 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2366 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2367 SSE_SUB_PS(XMM7,XMM3) 2368 SSE_INLINE_END_2 2369 2370 v += 16; 2371 } 2372 v = aa + 16*ai[++i]; 2373 PREFETCH_NTA(v); 2374 STORE_PS(&t[idx],XMM7); 2375 } 2376 2377 /* Backward solve the upper triangular factor.*/ 2378 2379 idt = 4*(n-1); 2380 ai16 = 16*diag[n-1]; 2381 v = aa + ai16 + 16; 2382 for (i=n-1; i>=0;){ 2383 PREFETCH_NTA(&v[8]); 2384 vi = aj + diag[i] + 1; 2385 nz = ai[i+1] - diag[i] - 1; 2386 2387 LOAD_PS(&t[idt],XMM7); 2388 2389 while (nz--) { 2390 PREFETCH_NTA(&v[16]); 2391 idx = 4*((unsigned int)(*vi++)); 2392 2393 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2394 SSE_INLINE_BEGIN_2(&t[idx],v) 2395 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2396 2397 /* First Column */ 2398 SSE_COPY_PS(XMM0,XMM6) 2399 SSE_SHUFFLE(XMM0,XMM0,0x00) 2400 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2401 SSE_SUB_PS(XMM7,XMM0) 2402 2403 /* Second Column */ 2404 SSE_COPY_PS(XMM1,XMM6) 2405 SSE_SHUFFLE(XMM1,XMM1,0x55) 2406 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2407 SSE_SUB_PS(XMM7,XMM1) 2408 2409 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2410 2411 /* Third Column */ 2412 SSE_COPY_PS(XMM2,XMM6) 2413 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2414 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2415 SSE_SUB_PS(XMM7,XMM2) 2416 2417 /* Fourth Column */ 2418 SSE_COPY_PS(XMM3,XMM6) 2419 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2420 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2421 SSE_SUB_PS(XMM7,XMM3) 2422 SSE_INLINE_END_2 2423 v += 16; 2424 } 2425 v = aa + ai16; 2426 ai16 = 16*diag[--i]; 2427 PREFETCH_NTA(aa+ai16+16); 2428 /* 2429 Scale the result by the diagonal 4x4 block, 2430 which was inverted as part of the factorization 2431 */ 2432 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2433 /* First Column */ 2434 SSE_COPY_PS(XMM0,XMM7) 2435 SSE_SHUFFLE(XMM0,XMM0,0x00) 2436 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2437 2438 /* Second Column */ 2439 SSE_COPY_PS(XMM1,XMM7) 2440 SSE_SHUFFLE(XMM1,XMM1,0x55) 2441 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2442 SSE_ADD_PS(XMM0,XMM1) 2443 2444 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2445 2446 /* Third Column */ 2447 SSE_COPY_PS(XMM2,XMM7) 2448 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2449 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2450 SSE_ADD_PS(XMM0,XMM2) 2451 2452 /* Fourth Column */ 2453 SSE_COPY_PS(XMM3,XMM7) 2454 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2455 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2456 SSE_ADD_PS(XMM0,XMM3) 2457 2458 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2459 SSE_INLINE_END_3 2460 2461 v = aa + ai16 + 16; 2462 idt -= 4; 2463 } 2464 2465 /* Convert t from single precision back to double precision (inplace)*/ 2466 idt = 4*(n-1); 2467 for (i=n-1;i>=0;i--) { 2468 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2469 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2470 PetscScalar *xtemp=&x[idt]; 2471 MatScalar *ttemp=&t[idt]; 2472 xtemp[3] = (PetscScalar)ttemp[3]; 2473 xtemp[2] = (PetscScalar)ttemp[2]; 2474 xtemp[1] = (PetscScalar)ttemp[1]; 2475 xtemp[0] = (PetscScalar)ttemp[0]; 2476 idt -= 4; 2477 } 2478 2479 } /* End of artificial scope. */ 2480 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2481 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2482 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2483 SSE_SCOPE_END; 2484 PetscFunctionReturn(0); 2485 } 2486 2487 #undef __FUNCT__ 2488 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2489 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2490 { 2491 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2492 int *aj=a->j; 2493 PetscErrorCode ierr; 2494 int *ai=a->i,n=a->mbs,*diag = a->diag; 2495 MatScalar *aa=a->a; 2496 PetscScalar *x,*b; 2497 2498 PetscFunctionBegin; 2499 SSE_SCOPE_BEGIN; 2500 /* 2501 Note: This code currently uses demotion of double 2502 to float when performing the mixed-mode computation. 2503 This may not be numerically reasonable for all applications. 2504 */ 2505 PREFETCH_NTA(aa+16*ai[1]); 2506 2507 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2508 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2509 { 2510 /* x will first be computed in single precision then promoted inplace to double */ 2511 MatScalar *v,*t=(MatScalar *)x; 2512 int nz,i,idt,ai16; 2513 int jdx,idx; 2514 int *vi; 2515 /* Forward solve the lower triangular factor. */ 2516 2517 /* First block is the identity. */ 2518 idx = 0; 2519 CONVERT_DOUBLE4_FLOAT4(t,b); 2520 v = aa + 16*ai[1]; 2521 2522 for (i=1; i<n;) { 2523 PREFETCH_NTA(&v[8]); 2524 vi = aj + ai[i]; 2525 nz = diag[i] - ai[i]; 2526 idx += 4; 2527 2528 /* Demote RHS from double to float. */ 2529 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2530 LOAD_PS(&t[idx],XMM7); 2531 2532 while (nz--) { 2533 PREFETCH_NTA(&v[16]); 2534 jdx = 4*(*vi++); 2535 /* jdx = *vi++; */ 2536 2537 /* 4x4 Matrix-Vector product with negative accumulation: */ 2538 SSE_INLINE_BEGIN_2(&t[jdx],v) 2539 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2540 2541 /* First Column */ 2542 SSE_COPY_PS(XMM0,XMM6) 2543 SSE_SHUFFLE(XMM0,XMM0,0x00) 2544 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2545 SSE_SUB_PS(XMM7,XMM0) 2546 2547 /* Second Column */ 2548 SSE_COPY_PS(XMM1,XMM6) 2549 SSE_SHUFFLE(XMM1,XMM1,0x55) 2550 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2551 SSE_SUB_PS(XMM7,XMM1) 2552 2553 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2554 2555 /* Third Column */ 2556 SSE_COPY_PS(XMM2,XMM6) 2557 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2558 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2559 SSE_SUB_PS(XMM7,XMM2) 2560 2561 /* Fourth Column */ 2562 SSE_COPY_PS(XMM3,XMM6) 2563 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2564 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2565 SSE_SUB_PS(XMM7,XMM3) 2566 SSE_INLINE_END_2 2567 2568 v += 16; 2569 } 2570 v = aa + 16*ai[++i]; 2571 PREFETCH_NTA(v); 2572 STORE_PS(&t[idx],XMM7); 2573 } 2574 2575 /* Backward solve the upper triangular factor.*/ 2576 2577 idt = 4*(n-1); 2578 ai16 = 16*diag[n-1]; 2579 v = aa + ai16 + 16; 2580 for (i=n-1; i>=0;){ 2581 PREFETCH_NTA(&v[8]); 2582 vi = aj + diag[i] + 1; 2583 nz = ai[i+1] - diag[i] - 1; 2584 2585 LOAD_PS(&t[idt],XMM7); 2586 2587 while (nz--) { 2588 PREFETCH_NTA(&v[16]); 2589 idx = 4*(*vi++); 2590 /* idx = *vi++; */ 2591 2592 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2593 SSE_INLINE_BEGIN_2(&t[idx],v) 2594 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2595 2596 /* First Column */ 2597 SSE_COPY_PS(XMM0,XMM6) 2598 SSE_SHUFFLE(XMM0,XMM0,0x00) 2599 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2600 SSE_SUB_PS(XMM7,XMM0) 2601 2602 /* Second Column */ 2603 SSE_COPY_PS(XMM1,XMM6) 2604 SSE_SHUFFLE(XMM1,XMM1,0x55) 2605 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2606 SSE_SUB_PS(XMM7,XMM1) 2607 2608 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2609 2610 /* Third Column */ 2611 SSE_COPY_PS(XMM2,XMM6) 2612 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2613 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2614 SSE_SUB_PS(XMM7,XMM2) 2615 2616 /* Fourth Column */ 2617 SSE_COPY_PS(XMM3,XMM6) 2618 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2619 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2620 SSE_SUB_PS(XMM7,XMM3) 2621 SSE_INLINE_END_2 2622 v += 16; 2623 } 2624 v = aa + ai16; 2625 ai16 = 16*diag[--i]; 2626 PREFETCH_NTA(aa+ai16+16); 2627 /* 2628 Scale the result by the diagonal 4x4 block, 2629 which was inverted as part of the factorization 2630 */ 2631 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2632 /* First Column */ 2633 SSE_COPY_PS(XMM0,XMM7) 2634 SSE_SHUFFLE(XMM0,XMM0,0x00) 2635 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2636 2637 /* Second Column */ 2638 SSE_COPY_PS(XMM1,XMM7) 2639 SSE_SHUFFLE(XMM1,XMM1,0x55) 2640 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2641 SSE_ADD_PS(XMM0,XMM1) 2642 2643 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2644 2645 /* Third Column */ 2646 SSE_COPY_PS(XMM2,XMM7) 2647 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2648 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2649 SSE_ADD_PS(XMM0,XMM2) 2650 2651 /* Fourth Column */ 2652 SSE_COPY_PS(XMM3,XMM7) 2653 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2654 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2655 SSE_ADD_PS(XMM0,XMM3) 2656 2657 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2658 SSE_INLINE_END_3 2659 2660 v = aa + ai16 + 16; 2661 idt -= 4; 2662 } 2663 2664 /* Convert t from single precision back to double precision (inplace)*/ 2665 idt = 4*(n-1); 2666 for (i=n-1;i>=0;i--) { 2667 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2668 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2669 PetscScalar *xtemp=&x[idt]; 2670 MatScalar *ttemp=&t[idt]; 2671 xtemp[3] = (PetscScalar)ttemp[3]; 2672 xtemp[2] = (PetscScalar)ttemp[2]; 2673 xtemp[1] = (PetscScalar)ttemp[1]; 2674 xtemp[0] = (PetscScalar)ttemp[0]; 2675 idt -= 4; 2676 } 2677 2678 } /* End of artificial scope. */ 2679 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2680 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2681 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2682 SSE_SCOPE_END; 2683 PetscFunctionReturn(0); 2684 } 2685 2686 #endif 2687 #undef __FUNCT__ 2688 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2689 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2690 { 2691 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2692 IS iscol=a->col,isrow=a->row; 2693 PetscErrorCode ierr; 2694 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2695 PetscInt *diag = a->diag; 2696 const MatScalar *aa=a->a,*v; 2697 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2698 const PetscScalar *b; 2699 2700 PetscFunctionBegin; 2701 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2702 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2703 t = a->solve_work; 2704 2705 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2706 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2707 2708 /* forward solve the lower triangular */ 2709 idx = 3*(*r++); 2710 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2711 for (i=1; i<n; i++) { 2712 v = aa + 9*ai[i]; 2713 vi = aj + ai[i]; 2714 nz = diag[i] - ai[i]; 2715 idx = 3*(*r++); 2716 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2717 while (nz--) { 2718 idx = 3*(*vi++); 2719 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2720 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2721 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2722 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2723 v += 9; 2724 } 2725 idx = 3*i; 2726 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2727 } 2728 /* backward solve the upper triangular */ 2729 for (i=n-1; i>=0; i--){ 2730 v = aa + 9*diag[i] + 9; 2731 vi = aj + diag[i] + 1; 2732 nz = ai[i+1] - diag[i] - 1; 2733 idt = 3*i; 2734 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2735 while (nz--) { 2736 idx = 3*(*vi++); 2737 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2738 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2739 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2740 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2741 v += 9; 2742 } 2743 idc = 3*(*c--); 2744 v = aa + 9*diag[i]; 2745 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2746 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2747 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2748 } 2749 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2750 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2751 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2752 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2753 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 2754 PetscFunctionReturn(0); 2755 } 2756 2757 /* 2758 Special case where the matrix was ILU(0) factored in the natural 2759 ordering. This eliminates the need for the column and row permutation. 2760 */ 2761 #undef __FUNCT__ 2762 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2763 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2764 { 2765 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2766 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2767 PetscErrorCode ierr; 2768 PetscInt *diag = a->diag; 2769 const MatScalar *aa=a->a,*v; 2770 PetscScalar *x,s1,s2,s3,x1,x2,x3; 2771 const PetscScalar *b; 2772 PetscInt jdx,idt,idx,nz,*vi,i; 2773 2774 PetscFunctionBegin; 2775 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2776 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2777 2778 /* forward solve the lower triangular */ 2779 idx = 0; 2780 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2781 for (i=1; i<n; i++) { 2782 v = aa + 9*ai[i]; 2783 vi = aj + ai[i]; 2784 nz = diag[i] - ai[i]; 2785 idx += 3; 2786 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2787 while (nz--) { 2788 jdx = 3*(*vi++); 2789 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2790 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2791 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2792 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2793 v += 9; 2794 } 2795 x[idx] = s1; 2796 x[1+idx] = s2; 2797 x[2+idx] = s3; 2798 } 2799 /* backward solve the upper triangular */ 2800 for (i=n-1; i>=0; i--){ 2801 v = aa + 9*diag[i] + 9; 2802 vi = aj + diag[i] + 1; 2803 nz = ai[i+1] - diag[i] - 1; 2804 idt = 3*i; 2805 s1 = x[idt]; s2 = x[1+idt]; 2806 s3 = x[2+idt]; 2807 while (nz--) { 2808 idx = 3*(*vi++); 2809 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2810 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2811 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2812 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2813 v += 9; 2814 } 2815 v = aa + 9*diag[i]; 2816 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2817 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2818 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2819 } 2820 2821 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2822 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2823 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 2824 PetscFunctionReturn(0); 2825 } 2826 2827 #undef __FUNCT__ 2828 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2829 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2830 { 2831 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2832 IS iscol=a->col,isrow=a->row; 2833 PetscErrorCode ierr; 2834 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2835 PetscInt *diag = a->diag; 2836 const MatScalar *aa=a->a,*v; 2837 PetscScalar *x,s1,s2,x1,x2,*t; 2838 const PetscScalar *b; 2839 2840 PetscFunctionBegin; 2841 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2842 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2843 t = a->solve_work; 2844 2845 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2846 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2847 2848 /* forward solve the lower triangular */ 2849 idx = 2*(*r++); 2850 t[0] = b[idx]; t[1] = b[1+idx]; 2851 for (i=1; i<n; i++) { 2852 v = aa + 4*ai[i]; 2853 vi = aj + ai[i]; 2854 nz = diag[i] - ai[i]; 2855 idx = 2*(*r++); 2856 s1 = b[idx]; s2 = b[1+idx]; 2857 while (nz--) { 2858 idx = 2*(*vi++); 2859 x1 = t[idx]; x2 = t[1+idx]; 2860 s1 -= v[0]*x1 + v[2]*x2; 2861 s2 -= v[1]*x1 + v[3]*x2; 2862 v += 4; 2863 } 2864 idx = 2*i; 2865 t[idx] = s1; t[1+idx] = s2; 2866 } 2867 /* backward solve the upper triangular */ 2868 for (i=n-1; i>=0; i--){ 2869 v = aa + 4*diag[i] + 4; 2870 vi = aj + diag[i] + 1; 2871 nz = ai[i+1] - diag[i] - 1; 2872 idt = 2*i; 2873 s1 = t[idt]; s2 = t[1+idt]; 2874 while (nz--) { 2875 idx = 2*(*vi++); 2876 x1 = t[idx]; x2 = t[1+idx]; 2877 s1 -= v[0]*x1 + v[2]*x2; 2878 s2 -= v[1]*x1 + v[3]*x2; 2879 v += 4; 2880 } 2881 idc = 2*(*c--); 2882 v = aa + 4*diag[i]; 2883 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2884 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2885 } 2886 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2887 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2888 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2889 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2890 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 2891 PetscFunctionReturn(0); 2892 } 2893 2894 /* 2895 Special case where the matrix was ILU(0) factored in the natural 2896 ordering. This eliminates the need for the column and row permutation. 2897 */ 2898 #undef __FUNCT__ 2899 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2900 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2901 { 2902 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2903 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2904 PetscErrorCode ierr; 2905 PetscInt *diag = a->diag; 2906 const MatScalar *aa=a->a,*v; 2907 PetscScalar *x,s1,s2,x1,x2; 2908 const PetscScalar *b; 2909 PetscInt jdx,idt,idx,nz,*vi,i; 2910 2911 PetscFunctionBegin; 2912 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2914 2915 /* forward solve the lower triangular */ 2916 idx = 0; 2917 x[0] = b[0]; x[1] = b[1]; 2918 for (i=1; i<n; i++) { 2919 v = aa + 4*ai[i]; 2920 vi = aj + ai[i]; 2921 nz = diag[i] - ai[i]; 2922 idx += 2; 2923 s1 = b[idx];s2 = b[1+idx]; 2924 while (nz--) { 2925 jdx = 2*(*vi++); 2926 x1 = x[jdx];x2 = x[1+jdx]; 2927 s1 -= v[0]*x1 + v[2]*x2; 2928 s2 -= v[1]*x1 + v[3]*x2; 2929 v += 4; 2930 } 2931 x[idx] = s1; 2932 x[1+idx] = s2; 2933 } 2934 /* backward solve the upper triangular */ 2935 for (i=n-1; i>=0; i--){ 2936 v = aa + 4*diag[i] + 4; 2937 vi = aj + diag[i] + 1; 2938 nz = ai[i+1] - diag[i] - 1; 2939 idt = 2*i; 2940 s1 = x[idt]; s2 = x[1+idt]; 2941 while (nz--) { 2942 idx = 2*(*vi++); 2943 x1 = x[idx]; x2 = x[1+idx]; 2944 s1 -= v[0]*x1 + v[2]*x2; 2945 s2 -= v[1]*x1 + v[3]*x2; 2946 v += 4; 2947 } 2948 v = aa + 4*diag[i]; 2949 x[idt] = v[0]*s1 + v[2]*s2; 2950 x[1+idt] = v[1]*s1 + v[3]*s2; 2951 } 2952 2953 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2954 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2955 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 2956 PetscFunctionReturn(0); 2957 } 2958 2959 #undef __FUNCT__ 2960 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2961 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2962 { 2963 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2964 IS iscol=a->col,isrow=a->row; 2965 PetscErrorCode ierr; 2966 PetscInt *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2967 PetscInt *diag = a->diag; 2968 MatScalar *aa=a->a,*v; 2969 PetscScalar *x,*b,s1,*t; 2970 2971 PetscFunctionBegin; 2972 if (!n) PetscFunctionReturn(0); 2973 2974 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2975 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2976 t = a->solve_work; 2977 2978 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2979 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2980 2981 /* forward solve the lower triangular */ 2982 t[0] = b[*r++]; 2983 for (i=1; i<n; i++) { 2984 v = aa + ai[i]; 2985 vi = aj + ai[i]; 2986 nz = diag[i] - ai[i]; 2987 s1 = b[*r++]; 2988 while (nz--) { 2989 s1 -= (*v++)*t[*vi++]; 2990 } 2991 t[i] = s1; 2992 } 2993 /* backward solve the upper triangular */ 2994 for (i=n-1; i>=0; i--){ 2995 v = aa + diag[i] + 1; 2996 vi = aj + diag[i] + 1; 2997 nz = ai[i+1] - diag[i] - 1; 2998 s1 = t[i]; 2999 while (nz--) { 3000 s1 -= (*v++)*t[*vi++]; 3001 } 3002 x[*c--] = t[i] = aa[diag[i]]*s1; 3003 } 3004 3005 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3006 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3007 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3008 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3009 ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3010 PetscFunctionReturn(0); 3011 } 3012 /* 3013 Special case where the matrix was ILU(0) factored in the natural 3014 ordering. This eliminates the need for the column and row permutation. 3015 */ 3016 #undef __FUNCT__ 3017 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3018 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3019 { 3020 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3021 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3022 PetscErrorCode ierr; 3023 PetscInt *diag = a->diag; 3024 MatScalar *aa=a->a; 3025 PetscScalar *x,*b; 3026 PetscScalar s1,x1; 3027 MatScalar *v; 3028 PetscInt jdx,idt,idx,nz,*vi,i; 3029 3030 PetscFunctionBegin; 3031 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3032 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3033 3034 /* forward solve the lower triangular */ 3035 idx = 0; 3036 x[0] = b[0]; 3037 for (i=1; i<n; i++) { 3038 v = aa + ai[i]; 3039 vi = aj + ai[i]; 3040 nz = diag[i] - ai[i]; 3041 idx += 1; 3042 s1 = b[idx]; 3043 while (nz--) { 3044 jdx = *vi++; 3045 x1 = x[jdx]; 3046 s1 -= v[0]*x1; 3047 v += 1; 3048 } 3049 x[idx] = s1; 3050 } 3051 /* backward solve the upper triangular */ 3052 for (i=n-1; i>=0; i--){ 3053 v = aa + diag[i] + 1; 3054 vi = aj + diag[i] + 1; 3055 nz = ai[i+1] - diag[i] - 1; 3056 idt = i; 3057 s1 = x[idt]; 3058 while (nz--) { 3059 idx = *vi++; 3060 x1 = x[idx]; 3061 s1 -= v[0]*x1; 3062 v += 1; 3063 } 3064 v = aa + diag[i]; 3065 x[idt] = v[0]*s1; 3066 } 3067 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3068 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3069 ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3070 PetscFunctionReturn(0); 3071 } 3072 3073 /* ----------------------------------------------------------------*/ 3074 /* 3075 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3076 except that the data structure of Mat_SeqAIJ is slightly different. 3077 Not a good example of code reuse. 3078 */ 3079 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,MatDuplicateOption,Mat*); 3080 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3081 3082 #undef __FUNCT__ 3083 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3084 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3085 { 3086 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3087 IS isicol; 3088 PetscErrorCode ierr; 3089 PetscInt *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3090 PetscInt *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3091 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3092 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3093 PetscTruth col_identity,row_identity,flg; 3094 PetscReal f; 3095 3096 PetscFunctionBegin; 3097 f = info->fill; 3098 levels = (PetscInt)info->levels; 3099 diagonal_fill = (PetscInt)info->diagonal_fill; 3100 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3101 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3102 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3103 3104 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3105 ierr = MatDuplicateNoCreate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3106 (*fact)->factor = MAT_FACTOR_LU; 3107 b = (Mat_SeqBAIJ*)(*fact)->data; 3108 ierr = MatMissingDiagonal_SeqBAIJ(*fact,&flg,&dd);CHKERRQ(ierr); 3109 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd); 3110 b->row = isrow; 3111 b->col = iscol; 3112 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3113 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3114 b->icol = isicol; 3115 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3116 ierr = PetscMalloc(((*fact)->rmap->N+1+(*fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3117 } else { /* general case perform the symbolic factorization */ 3118 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3119 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3120 3121 /* get new row pointers */ 3122 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3123 ainew[0] = 0; 3124 /* don't know how many column pointers are needed so estimate */ 3125 jmax = (PetscInt)(f*ai[n] + 1); 3126 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3127 /* ajfill is level of fill for each fill entry */ 3128 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3129 /* fill is a linked list of nonzeros in active row */ 3130 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3131 /* im is level for each filled value */ 3132 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3133 /* dloc is location of diagonal in factor */ 3134 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3135 dloc[0] = 0; 3136 for (prow=0; prow<n; prow++) { 3137 3138 /* copy prow into linked list */ 3139 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3140 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3141 xi = aj + ai[r[prow]]; 3142 fill[n] = n; 3143 fill[prow] = -1; /* marker for diagonal entry */ 3144 while (nz--) { 3145 fm = n; 3146 idx = ic[*xi++]; 3147 do { 3148 m = fm; 3149 fm = fill[m]; 3150 } while (fm < idx); 3151 fill[m] = idx; 3152 fill[idx] = fm; 3153 im[idx] = 0; 3154 } 3155 3156 /* make sure diagonal entry is included */ 3157 if (diagonal_fill && fill[prow] == -1) { 3158 fm = n; 3159 while (fill[fm] < prow) fm = fill[fm]; 3160 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3161 fill[fm] = prow; 3162 im[prow] = 0; 3163 nzf++; 3164 dcount++; 3165 } 3166 3167 nzi = 0; 3168 row = fill[n]; 3169 while (row < prow) { 3170 incrlev = im[row] + 1; 3171 nz = dloc[row]; 3172 xi = ajnew + ainew[row] + nz + 1; 3173 flev = ajfill + ainew[row] + nz + 1; 3174 nnz = ainew[row+1] - ainew[row] - nz - 1; 3175 fm = row; 3176 while (nnz-- > 0) { 3177 idx = *xi++; 3178 if (*flev + incrlev > levels) { 3179 flev++; 3180 continue; 3181 } 3182 do { 3183 m = fm; 3184 fm = fill[m]; 3185 } while (fm < idx); 3186 if (fm != idx) { 3187 im[idx] = *flev + incrlev; 3188 fill[m] = idx; 3189 fill[idx] = fm; 3190 fm = idx; 3191 nzf++; 3192 } else { 3193 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3194 } 3195 flev++; 3196 } 3197 row = fill[row]; 3198 nzi++; 3199 } 3200 /* copy new filled row into permanent storage */ 3201 ainew[prow+1] = ainew[prow] + nzf; 3202 if (ainew[prow+1] > jmax) { 3203 3204 /* estimate how much additional space we will need */ 3205 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3206 /* just double the memory each time */ 3207 PetscInt maxadd = jmax; 3208 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3209 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3210 jmax += maxadd; 3211 3212 /* allocate a longer ajnew and ajfill */ 3213 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3214 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3215 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3216 ajnew = xi; 3217 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr); 3218 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3219 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3220 ajfill = xi; 3221 reallocate++; /* count how many reallocations are needed */ 3222 } 3223 xi = ajnew + ainew[prow]; 3224 flev = ajfill + ainew[prow]; 3225 dloc[prow] = nzi; 3226 fm = fill[n]; 3227 while (nzf--) { 3228 *xi++ = fm; 3229 *flev++ = im[fm]; 3230 fm = fill[fm]; 3231 } 3232 /* make sure row has diagonal entry */ 3233 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3234 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3235 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3236 } 3237 } 3238 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3239 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3240 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3241 ierr = PetscFree(fill);CHKERRQ(ierr); 3242 ierr = PetscFree(im);CHKERRQ(ierr); 3243 3244 #if defined(PETSC_USE_INFO) 3245 { 3246 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3247 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3248 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3249 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3250 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3251 if (diagonal_fill) { 3252 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3253 } 3254 } 3255 #endif 3256 3257 /* put together the new matrix */ 3258 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(*fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3259 ierr = PetscLogObjectParent(*fact,isicol);CHKERRQ(ierr); 3260 b = (Mat_SeqBAIJ*)(*fact)->data; 3261 b->free_a = PETSC_TRUE; 3262 b->free_ij = PETSC_TRUE; 3263 b->singlemalloc = PETSC_FALSE; 3264 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3265 b->j = ajnew; 3266 b->i = ainew; 3267 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3268 b->diag = dloc; 3269 b->ilen = 0; 3270 b->imax = 0; 3271 b->row = isrow; 3272 b->col = iscol; 3273 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3274 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3275 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3276 b->icol = isicol; 3277 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3278 /* In b structure: Free imax, ilen, old a, old j. 3279 Allocate dloc, solve_work, new a, new j */ 3280 ierr = PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3281 b->maxnz = b->nz = ainew[n]; 3282 (*fact)->factor = MAT_FACTOR_LU; 3283 3284 (*fact)->info.factor_mallocs = reallocate; 3285 (*fact)->info.fill_ratio_given = f; 3286 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3287 } 3288 ierr = MatSeqBAIJSetNumericFactorization(*fact,(PetscTruth)(row_identity && col_identity));CHKERRQ(ierr); 3289 PetscFunctionReturn(0); 3290 } 3291 3292 #undef __FUNCT__ 3293 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3294 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3295 { 3296 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3297 /* int i,*AJ=a->j,nz=a->nz; */ 3298 PetscFunctionBegin; 3299 /* Undo Column scaling */ 3300 /* while (nz--) { */ 3301 /* AJ[i] = AJ[i]/4; */ 3302 /* } */ 3303 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3304 A->ops->setunfactored = PETSC_NULL; 3305 PetscFunctionReturn(0); 3306 } 3307 3308 #undef __FUNCT__ 3309 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3310 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3311 { 3312 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3313 PetscInt *AJ=a->j,nz=a->nz; 3314 unsigned short *aj=(unsigned short *)AJ; 3315 PetscFunctionBegin; 3316 /* Is this really necessary? */ 3317 while (nz--) { 3318 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3319 } 3320 A->ops->setunfactored = PETSC_NULL; 3321 PetscFunctionReturn(0); 3322 } 3323 3324 3325