1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59 { 60 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61 PetscErrorCode ierr; 62 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63 PetscInt *diag = a->diag,oidx; 64 MatScalar *aa=a->a,*v; 65 PetscScalar s1,s2,x1,x2; 66 PetscScalar *x,*b; 67 68 PetscFunctionBegin; 69 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 70 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 71 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72 73 /* forward solve the U^T */ 74 idx = 0; 75 for (i=0; i<n; i++) { 76 77 v = aa + 4*diag[i]; 78 /* multiply by the inverse of the block diagonal */ 79 x1 = x[idx]; x2 = x[1+idx]; 80 s1 = v[0]*x1 + v[1]*x2; 81 s2 = v[2]*x1 + v[3]*x2; 82 v += 4; 83 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 oidx = 2*(*vi++); 88 x[oidx] -= v[0]*s1 + v[1]*s2; 89 x[oidx+1] -= v[2]*s1 + v[3]*s2; 90 v += 4; 91 } 92 x[idx] = s1;x[1+idx] = s2; 93 idx += 2; 94 } 95 /* backward solve the L^T */ 96 for (i=n-1; i>=0; i--){ 97 v = aa + 4*diag[i] - 4; 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - ai[i]; 100 idt = 2*i; 101 s1 = x[idt]; s2 = x[1+idt]; 102 while (nz--) { 103 idx = 2*(*vi--); 104 x[idx] -= v[0]*s1 + v[1]*s2; 105 x[idx+1] -= v[2]*s1 + v[3]*s2; 106 v -= 4; 107 } 108 } 109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118 { 119 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120 PetscErrorCode ierr; 121 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122 PetscInt *diag = a->diag,oidx; 123 MatScalar *aa=a->a,*v; 124 PetscScalar s1,s2,s3,x1,x2,x3; 125 PetscScalar *x,*b; 126 127 PetscFunctionBegin; 128 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 129 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 130 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131 132 /* forward solve the U^T */ 133 idx = 0; 134 for (i=0; i<n; i++) { 135 136 v = aa + 9*diag[i]; 137 /* multiply by the inverse of the block diagonal */ 138 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142 v += 9; 143 144 vi = aj + diag[i] + 1; 145 nz = ai[i+1] - diag[i] - 1; 146 while (nz--) { 147 oidx = 3*(*vi++); 148 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151 v += 9; 152 } 153 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154 idx += 3; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + 9*diag[i] - 9; 159 vi = aj + diag[i] - 1; 160 nz = diag[i] - ai[i]; 161 idt = 3*i; 162 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163 while (nz--) { 164 idx = 3*(*vi--); 165 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168 v -= 9; 169 } 170 } 171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180 { 181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182 PetscErrorCode ierr; 183 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184 PetscInt *diag = a->diag,oidx; 185 MatScalar *aa=a->a,*v; 186 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 187 PetscScalar *x,*b; 188 189 PetscFunctionBegin; 190 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 191 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193 194 /* forward solve the U^T */ 195 idx = 0; 196 for (i=0; i<n; i++) { 197 198 v = aa + 16*diag[i]; 199 /* multiply by the inverse of the block diagonal */ 200 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205 v += 16; 206 207 vi = aj + diag[i] + 1; 208 nz = ai[i+1] - diag[i] - 1; 209 while (nz--) { 210 oidx = 4*(*vi++); 211 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215 v += 16; 216 } 217 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218 idx += 4; 219 } 220 /* backward solve the L^T */ 221 for (i=n-1; i>=0; i--){ 222 v = aa + 16*diag[i] - 16; 223 vi = aj + diag[i] - 1; 224 nz = diag[i] - ai[i]; 225 idt = 4*i; 226 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi--); 229 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233 v -= 16; 234 } 235 } 236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245 { 246 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247 PetscErrorCode ierr; 248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249 PetscInt *diag = a->diag,oidx; 250 MatScalar *aa=a->a,*v; 251 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 252 PetscScalar *x,*b; 253 254 PetscFunctionBegin; 255 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258 259 /* forward solve the U^T */ 260 idx = 0; 261 for (i=0; i<n; i++) { 262 263 v = aa + 25*diag[i]; 264 /* multiply by the inverse of the block diagonal */ 265 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271 v += 25; 272 273 vi = aj + diag[i] + 1; 274 nz = ai[i+1] - diag[i] - 1; 275 while (nz--) { 276 oidx = 5*(*vi++); 277 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282 v += 25; 283 } 284 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285 idx += 5; 286 } 287 /* backward solve the L^T */ 288 for (i=n-1; i>=0; i--){ 289 v = aa + 25*diag[i] - 25; 290 vi = aj + diag[i] - 1; 291 nz = diag[i] - ai[i]; 292 idt = 5*i; 293 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294 while (nz--) { 295 idx = 5*(*vi--); 296 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301 v -= 25; 302 } 303 } 304 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 #undef __FUNCT__ 311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313 { 314 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315 PetscErrorCode ierr; 316 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317 PetscInt *diag = a->diag,oidx; 318 MatScalar *aa=a->a,*v; 319 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 320 PetscScalar *x,*b; 321 322 PetscFunctionBegin; 323 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 324 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326 327 /* forward solve the U^T */ 328 idx = 0; 329 for (i=0; i<n; i++) { 330 331 v = aa + 36*diag[i]; 332 /* multiply by the inverse of the block diagonal */ 333 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334 x6 = x[5+idx]; 335 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341 v += 36; 342 343 vi = aj + diag[i] + 1; 344 nz = ai[i+1] - diag[i] - 1; 345 while (nz--) { 346 oidx = 6*(*vi++); 347 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353 v += 36; 354 } 355 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356 x[5+idx] = s6; 357 idx += 6; 358 } 359 /* backward solve the L^T */ 360 for (i=n-1; i>=0; i--){ 361 v = aa + 36*diag[i] - 36; 362 vi = aj + diag[i] - 1; 363 nz = diag[i] - ai[i]; 364 idt = 6*i; 365 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366 s6 = x[5+idt]; 367 while (nz--) { 368 idx = 6*(*vi--); 369 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375 v -= 36; 376 } 377 } 378 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389 PetscErrorCode ierr; 390 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391 PetscInt *diag = a->diag,oidx; 392 MatScalar *aa=a->a,*v; 393 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 394 PetscScalar *x,*b; 395 396 PetscFunctionBegin; 397 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400 401 /* forward solve the U^T */ 402 idx = 0; 403 for (i=0; i<n; i++) { 404 405 v = aa + 49*diag[i]; 406 /* multiply by the inverse of the block diagonal */ 407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408 x6 = x[5+idx]; x7 = x[6+idx]; 409 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416 v += 49; 417 418 vi = aj + diag[i] + 1; 419 nz = ai[i+1] - diag[i] - 1; 420 while (nz--) { 421 oidx = 7*(*vi++); 422 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429 v += 49; 430 } 431 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432 x[5+idx] = s6;x[6+idx] = s7; 433 idx += 7; 434 } 435 /* backward solve the L^T */ 436 for (i=n-1; i>=0; i--){ 437 v = aa + 49*diag[i] - 49; 438 vi = aj + diag[i] - 1; 439 nz = diag[i] - ai[i]; 440 idt = 7*i; 441 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442 s6 = x[5+idt];s7 = x[6+idt]; 443 while (nz--) { 444 idx = 7*(*vi--); 445 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452 v -= 49; 453 } 454 } 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 /*---------------------------------------------------------------------------------------------*/ 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467 IS iscol=a->col,isrow=a->row; 468 PetscErrorCode ierr; 469 const PetscInt *r,*c,*rout,*cout; 470 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 471 PetscInt *diag = a->diag; 472 MatScalar *aa=a->a,*v; 473 PetscScalar s1,*x,*b,*t; 474 475 PetscFunctionBegin; 476 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 477 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478 t = a->solve_work; 479 480 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 481 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 482 483 /* copy the b into temp work space according to permutation */ 484 for (i=0; i<n; i++) { 485 t[i] = b[c[i]]; 486 } 487 488 /* forward solve the U^T */ 489 for (i=0; i<n; i++) { 490 491 v = aa + diag[i]; 492 /* multiply by the inverse of the block diagonal */ 493 s1 = (*v++)*t[i]; 494 vi = aj + diag[i] + 1; 495 nz = ai[i+1] - diag[i] - 1; 496 while (nz--) { 497 t[*vi++] -= (*v++)*s1; 498 } 499 t[i] = s1; 500 } 501 /* backward solve the L^T */ 502 for (i=n-1; i>=0; i--){ 503 v = aa + diag[i] - 1; 504 vi = aj + diag[i] - 1; 505 nz = diag[i] - ai[i]; 506 s1 = t[i]; 507 while (nz--) { 508 t[*vi--] -= (*v--)*s1; 509 } 510 } 511 512 /* copy t into x according to permutation */ 513 for (i=0; i<n; i++) { 514 x[r[i]] = t[i]; 515 } 516 517 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 518 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 519 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 520 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521 ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 522 PetscFunctionReturn(0); 523 } 524 525 #undef __FUNCT__ 526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 527 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 528 { 529 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 530 IS iscol=a->col,isrow=a->row; 531 PetscErrorCode ierr; 532 const PetscInt *r,*c,*rout,*cout; 533 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 534 PetscInt *diag = a->diag,ii,ic,ir,oidx; 535 MatScalar *aa=a->a,*v; 536 PetscScalar s1,s2,x1,x2; 537 PetscScalar *x,*b,*t; 538 539 PetscFunctionBegin; 540 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 541 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 542 t = a->solve_work; 543 544 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 545 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 546 547 /* copy the b into temp work space according to permutation */ 548 ii = 0; 549 for (i=0; i<n; i++) { 550 ic = 2*c[i]; 551 t[ii] = b[ic]; 552 t[ii+1] = b[ic+1]; 553 ii += 2; 554 } 555 556 /* forward solve the U^T */ 557 idx = 0; 558 for (i=0; i<n; i++) { 559 560 v = aa + 4*diag[i]; 561 /* multiply by the inverse of the block diagonal */ 562 x1 = t[idx]; x2 = t[1+idx]; 563 s1 = v[0]*x1 + v[1]*x2; 564 s2 = v[2]*x1 + v[3]*x2; 565 v += 4; 566 567 vi = aj + diag[i] + 1; 568 nz = ai[i+1] - diag[i] - 1; 569 while (nz--) { 570 oidx = 2*(*vi++); 571 t[oidx] -= v[0]*s1 + v[1]*s2; 572 t[oidx+1] -= v[2]*s1 + v[3]*s2; 573 v += 4; 574 } 575 t[idx] = s1;t[1+idx] = s2; 576 idx += 2; 577 } 578 /* backward solve the L^T */ 579 for (i=n-1; i>=0; i--){ 580 v = aa + 4*diag[i] - 4; 581 vi = aj + diag[i] - 1; 582 nz = diag[i] - ai[i]; 583 idt = 2*i; 584 s1 = t[idt]; s2 = t[1+idt]; 585 while (nz--) { 586 idx = 2*(*vi--); 587 t[idx] -= v[0]*s1 + v[1]*s2; 588 t[idx+1] -= v[2]*s1 + v[3]*s2; 589 v -= 4; 590 } 591 } 592 593 /* copy t into x according to permutation */ 594 ii = 0; 595 for (i=0; i<n; i++) { 596 ir = 2*r[i]; 597 x[ir] = t[ii]; 598 x[ir+1] = t[ii+1]; 599 ii += 2; 600 } 601 602 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 603 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 604 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 605 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 606 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 607 PetscFunctionReturn(0); 608 } 609 610 #undef __FUNCT__ 611 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 612 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 613 { 614 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 615 IS iscol=a->col,isrow=a->row; 616 PetscErrorCode ierr; 617 const PetscInt *r,*c,*rout,*cout; 618 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 619 PetscInt *diag = a->diag,ii,ic,ir,oidx; 620 MatScalar *aa=a->a,*v; 621 PetscScalar s1,s2,s3,x1,x2,x3; 622 PetscScalar *x,*b,*t; 623 624 PetscFunctionBegin; 625 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 626 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 627 t = a->solve_work; 628 629 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 630 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 631 632 /* copy the b into temp work space according to permutation */ 633 ii = 0; 634 for (i=0; i<n; i++) { 635 ic = 3*c[i]; 636 t[ii] = b[ic]; 637 t[ii+1] = b[ic+1]; 638 t[ii+2] = b[ic+2]; 639 ii += 3; 640 } 641 642 /* forward solve the U^T */ 643 idx = 0; 644 for (i=0; i<n; i++) { 645 646 v = aa + 9*diag[i]; 647 /* multiply by the inverse of the block diagonal */ 648 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 649 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 650 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 651 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 652 v += 9; 653 654 vi = aj + diag[i] + 1; 655 nz = ai[i+1] - diag[i] - 1; 656 while (nz--) { 657 oidx = 3*(*vi++); 658 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 659 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 660 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 661 v += 9; 662 } 663 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 664 idx += 3; 665 } 666 /* backward solve the L^T */ 667 for (i=n-1; i>=0; i--){ 668 v = aa + 9*diag[i] - 9; 669 vi = aj + diag[i] - 1; 670 nz = diag[i] - ai[i]; 671 idt = 3*i; 672 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 673 while (nz--) { 674 idx = 3*(*vi--); 675 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 676 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 677 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 678 v -= 9; 679 } 680 } 681 682 /* copy t into x according to permutation */ 683 ii = 0; 684 for (i=0; i<n; i++) { 685 ir = 3*r[i]; 686 x[ir] = t[ii]; 687 x[ir+1] = t[ii+1]; 688 x[ir+2] = t[ii+2]; 689 ii += 3; 690 } 691 692 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 693 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 694 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 695 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 696 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 697 PetscFunctionReturn(0); 698 } 699 700 #undef __FUNCT__ 701 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 702 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 703 { 704 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 705 IS iscol=a->col,isrow=a->row; 706 PetscErrorCode ierr; 707 const PetscInt *r,*c,*rout,*cout; 708 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 709 PetscInt *diag = a->diag,ii,ic,ir,oidx; 710 MatScalar *aa=a->a,*v; 711 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 712 PetscScalar *x,*b,*t; 713 714 PetscFunctionBegin; 715 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 717 t = a->solve_work; 718 719 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 720 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 721 722 /* copy the b into temp work space according to permutation */ 723 ii = 0; 724 for (i=0; i<n; i++) { 725 ic = 4*c[i]; 726 t[ii] = b[ic]; 727 t[ii+1] = b[ic+1]; 728 t[ii+2] = b[ic+2]; 729 t[ii+3] = b[ic+3]; 730 ii += 4; 731 } 732 733 /* forward solve the U^T */ 734 idx = 0; 735 for (i=0; i<n; i++) { 736 737 v = aa + 16*diag[i]; 738 /* multiply by the inverse of the block diagonal */ 739 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 740 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 741 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 742 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 743 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 744 v += 16; 745 746 vi = aj + diag[i] + 1; 747 nz = ai[i+1] - diag[i] - 1; 748 while (nz--) { 749 oidx = 4*(*vi++); 750 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 751 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 752 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 753 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 754 v += 16; 755 } 756 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 757 idx += 4; 758 } 759 /* backward solve the L^T */ 760 for (i=n-1; i>=0; i--){ 761 v = aa + 16*diag[i] - 16; 762 vi = aj + diag[i] - 1; 763 nz = diag[i] - ai[i]; 764 idt = 4*i; 765 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 766 while (nz--) { 767 idx = 4*(*vi--); 768 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 769 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 770 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 771 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 772 v -= 16; 773 } 774 } 775 776 /* copy t into x according to permutation */ 777 ii = 0; 778 for (i=0; i<n; i++) { 779 ir = 4*r[i]; 780 x[ir] = t[ii]; 781 x[ir+1] = t[ii+1]; 782 x[ir+2] = t[ii+2]; 783 x[ir+3] = t[ii+3]; 784 ii += 4; 785 } 786 787 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 788 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 789 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 790 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 791 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 792 PetscFunctionReturn(0); 793 } 794 795 #undef __FUNCT__ 796 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 797 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 798 { 799 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 800 IS iscol=a->col,isrow=a->row; 801 PetscErrorCode ierr; 802 const PetscInt *r,*c,*rout,*cout; 803 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 804 PetscInt *diag = a->diag,ii,ic,ir,oidx; 805 MatScalar *aa=a->a,*v; 806 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 807 PetscScalar *x,*b,*t; 808 809 PetscFunctionBegin; 810 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 811 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 812 t = a->solve_work; 813 814 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 815 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 816 817 /* copy the b into temp work space according to permutation */ 818 ii = 0; 819 for (i=0; i<n; i++) { 820 ic = 5*c[i]; 821 t[ii] = b[ic]; 822 t[ii+1] = b[ic+1]; 823 t[ii+2] = b[ic+2]; 824 t[ii+3] = b[ic+3]; 825 t[ii+4] = b[ic+4]; 826 ii += 5; 827 } 828 829 /* forward solve the U^T */ 830 idx = 0; 831 for (i=0; i<n; i++) { 832 833 v = aa + 25*diag[i]; 834 /* multiply by the inverse of the block diagonal */ 835 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 836 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 837 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 838 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 839 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 840 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 841 v += 25; 842 843 vi = aj + diag[i] + 1; 844 nz = ai[i+1] - diag[i] - 1; 845 while (nz--) { 846 oidx = 5*(*vi++); 847 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 848 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 849 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 850 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 851 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 852 v += 25; 853 } 854 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 855 idx += 5; 856 } 857 /* backward solve the L^T */ 858 for (i=n-1; i>=0; i--){ 859 v = aa + 25*diag[i] - 25; 860 vi = aj + diag[i] - 1; 861 nz = diag[i] - ai[i]; 862 idt = 5*i; 863 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 864 while (nz--) { 865 idx = 5*(*vi--); 866 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 867 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 868 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 869 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 870 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 871 v -= 25; 872 } 873 } 874 875 /* copy t into x according to permutation */ 876 ii = 0; 877 for (i=0; i<n; i++) { 878 ir = 5*r[i]; 879 x[ir] = t[ii]; 880 x[ir+1] = t[ii+1]; 881 x[ir+2] = t[ii+2]; 882 x[ir+3] = t[ii+3]; 883 x[ir+4] = t[ii+4]; 884 ii += 5; 885 } 886 887 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 888 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 889 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 890 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 891 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 892 PetscFunctionReturn(0); 893 } 894 895 #undef __FUNCT__ 896 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 897 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 898 { 899 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 900 IS iscol=a->col,isrow=a->row; 901 PetscErrorCode ierr; 902 const PetscInt *r,*c,*rout,*cout; 903 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 904 PetscInt *diag = a->diag,ii,ic,ir,oidx; 905 MatScalar *aa=a->a,*v; 906 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 907 PetscScalar *x,*b,*t; 908 909 PetscFunctionBegin; 910 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 912 t = a->solve_work; 913 914 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 915 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 916 917 /* copy the b into temp work space according to permutation */ 918 ii = 0; 919 for (i=0; i<n; i++) { 920 ic = 6*c[i]; 921 t[ii] = b[ic]; 922 t[ii+1] = b[ic+1]; 923 t[ii+2] = b[ic+2]; 924 t[ii+3] = b[ic+3]; 925 t[ii+4] = b[ic+4]; 926 t[ii+5] = b[ic+5]; 927 ii += 6; 928 } 929 930 /* forward solve the U^T */ 931 idx = 0; 932 for (i=0; i<n; i++) { 933 934 v = aa + 36*diag[i]; 935 /* multiply by the inverse of the block diagonal */ 936 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 937 x6 = t[5+idx]; 938 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 939 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 940 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 941 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 942 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 943 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 944 v += 36; 945 946 vi = aj + diag[i] + 1; 947 nz = ai[i+1] - diag[i] - 1; 948 while (nz--) { 949 oidx = 6*(*vi++); 950 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 951 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 952 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 953 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 954 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 955 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 956 v += 36; 957 } 958 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 959 t[5+idx] = s6; 960 idx += 6; 961 } 962 /* backward solve the L^T */ 963 for (i=n-1; i>=0; i--){ 964 v = aa + 36*diag[i] - 36; 965 vi = aj + diag[i] - 1; 966 nz = diag[i] - ai[i]; 967 idt = 6*i; 968 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 969 s6 = t[5+idt]; 970 while (nz--) { 971 idx = 6*(*vi--); 972 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 973 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 974 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 975 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 976 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 977 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 978 v -= 36; 979 } 980 } 981 982 /* copy t into x according to permutation */ 983 ii = 0; 984 for (i=0; i<n; i++) { 985 ir = 6*r[i]; 986 x[ir] = t[ii]; 987 x[ir+1] = t[ii+1]; 988 x[ir+2] = t[ii+2]; 989 x[ir+3] = t[ii+3]; 990 x[ir+4] = t[ii+4]; 991 x[ir+5] = t[ii+5]; 992 ii += 6; 993 } 994 995 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 996 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 997 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 998 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 999 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1000 PetscFunctionReturn(0); 1001 } 1002 1003 #undef __FUNCT__ 1004 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1005 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1006 { 1007 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1008 IS iscol=a->col,isrow=a->row; 1009 PetscErrorCode ierr; 1010 const PetscInt *r,*c,*rout,*cout; 1011 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1012 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1013 MatScalar *aa=a->a,*v; 1014 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1015 PetscScalar *x,*b,*t; 1016 1017 PetscFunctionBegin; 1018 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1019 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1020 t = a->solve_work; 1021 1022 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1023 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1024 1025 /* copy the b into temp work space according to permutation */ 1026 ii = 0; 1027 for (i=0; i<n; i++) { 1028 ic = 7*c[i]; 1029 t[ii] = b[ic]; 1030 t[ii+1] = b[ic+1]; 1031 t[ii+2] = b[ic+2]; 1032 t[ii+3] = b[ic+3]; 1033 t[ii+4] = b[ic+4]; 1034 t[ii+5] = b[ic+5]; 1035 t[ii+6] = b[ic+6]; 1036 ii += 7; 1037 } 1038 1039 /* forward solve the U^T */ 1040 idx = 0; 1041 for (i=0; i<n; i++) { 1042 1043 v = aa + 49*diag[i]; 1044 /* multiply by the inverse of the block diagonal */ 1045 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1046 x6 = t[5+idx]; x7 = t[6+idx]; 1047 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1048 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1049 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1050 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1051 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1052 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1053 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1054 v += 49; 1055 1056 vi = aj + diag[i] + 1; 1057 nz = ai[i+1] - diag[i] - 1; 1058 while (nz--) { 1059 oidx = 7*(*vi++); 1060 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1061 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1062 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1063 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1064 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1065 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1066 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1067 v += 49; 1068 } 1069 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1070 t[5+idx] = s6;t[6+idx] = s7; 1071 idx += 7; 1072 } 1073 /* backward solve the L^T */ 1074 for (i=n-1; i>=0; i--){ 1075 v = aa + 49*diag[i] - 49; 1076 vi = aj + diag[i] - 1; 1077 nz = diag[i] - ai[i]; 1078 idt = 7*i; 1079 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1080 s6 = t[5+idt];s7 = t[6+idt]; 1081 while (nz--) { 1082 idx = 7*(*vi--); 1083 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1084 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1085 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1086 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1087 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1088 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1089 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1090 v -= 49; 1091 } 1092 } 1093 1094 /* copy t into x according to permutation */ 1095 ii = 0; 1096 for (i=0; i<n; i++) { 1097 ir = 7*r[i]; 1098 x[ir] = t[ii]; 1099 x[ir+1] = t[ii+1]; 1100 x[ir+2] = t[ii+2]; 1101 x[ir+3] = t[ii+3]; 1102 x[ir+4] = t[ii+4]; 1103 x[ir+5] = t[ii+5]; 1104 x[ir+6] = t[ii+6]; 1105 ii += 7; 1106 } 1107 1108 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1109 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1110 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1111 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1112 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 1113 PetscFunctionReturn(0); 1114 } 1115 1116 /* ----------------------------------------------------------- */ 1117 #undef __FUNCT__ 1118 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1119 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1120 { 1121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1122 IS iscol=a->col,isrow=a->row; 1123 PetscErrorCode ierr; 1124 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1125 PetscInt i,n=a->mbs; 1126 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1127 MatScalar *aa=a->a,*v; 1128 PetscScalar *x,*b,*s,*t,*ls; 1129 1130 PetscFunctionBegin; 1131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133 t = a->solve_work; 1134 1135 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1136 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1137 1138 /* forward solve the lower triangular */ 1139 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1140 for (i=1; i<n; i++) { 1141 v = aa + bs2*ai[i]; 1142 vi = aj + ai[i]; 1143 nz = a->diag[i] - ai[i]; 1144 s = t + bs*i; 1145 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1146 while (nz--) { 1147 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1148 v += bs2; 1149 } 1150 } 1151 /* backward solve the upper triangular */ 1152 ls = a->solve_work + A->cmap->n; 1153 for (i=n-1; i>=0; i--){ 1154 v = aa + bs2*(a->diag[i] + 1); 1155 vi = aj + a->diag[i] + 1; 1156 nz = ai[i+1] - a->diag[i] - 1; 1157 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1158 while (nz--) { 1159 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1160 v += bs2; 1161 } 1162 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1163 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1164 } 1165 1166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1176 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1177 { 1178 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1179 IS iscol=a->col,isrow=a->row; 1180 PetscErrorCode ierr; 1181 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1182 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1183 MatScalar *aa=a->a,*v; 1184 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1185 PetscScalar *x,*b,*t; 1186 1187 PetscFunctionBegin; 1188 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1189 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1190 t = a->solve_work; 1191 1192 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1193 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1194 1195 /* forward solve the lower triangular */ 1196 idx = 7*(*r++); 1197 t[0] = b[idx]; t[1] = b[1+idx]; 1198 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1199 t[5] = b[5+idx]; t[6] = b[6+idx]; 1200 1201 for (i=1; i<n; i++) { 1202 v = aa + 49*ai[i]; 1203 vi = aj + ai[i]; 1204 nz = diag[i] - ai[i]; 1205 idx = 7*(*r++); 1206 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1207 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1208 while (nz--) { 1209 idx = 7*(*vi++); 1210 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1211 x4 = t[3+idx];x5 = t[4+idx]; 1212 x6 = t[5+idx];x7 = t[6+idx]; 1213 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1214 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1215 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1216 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1217 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1218 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1219 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1220 v += 49; 1221 } 1222 idx = 7*i; 1223 t[idx] = s1;t[1+idx] = s2; 1224 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1225 t[5+idx] = s6;t[6+idx] = s7; 1226 } 1227 /* backward solve the upper triangular */ 1228 for (i=n-1; i>=0; i--){ 1229 v = aa + 49*diag[i] + 49; 1230 vi = aj + diag[i] + 1; 1231 nz = ai[i+1] - diag[i] - 1; 1232 idt = 7*i; 1233 s1 = t[idt]; s2 = t[1+idt]; 1234 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1235 s6 = t[5+idt];s7 = t[6+idt]; 1236 while (nz--) { 1237 idx = 7*(*vi++); 1238 x1 = t[idx]; x2 = t[1+idx]; 1239 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1240 x6 = t[5+idx]; x7 = t[6+idx]; 1241 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1242 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1243 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1244 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1245 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1246 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1247 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1248 v += 49; 1249 } 1250 idc = 7*(*c--); 1251 v = aa + 49*diag[i]; 1252 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1253 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1254 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1255 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1256 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1257 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1258 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1259 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1260 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1261 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1262 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1263 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1264 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1265 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1266 } 1267 1268 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1269 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1270 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1272 ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr); 1273 PetscFunctionReturn(0); 1274 } 1275 1276 #undef __FUNCT__ 1277 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1278 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1279 { 1280 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1281 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1282 PetscErrorCode ierr; 1283 PetscInt *diag = a->diag,jdx; 1284 const MatScalar *aa=a->a,*v; 1285 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1286 const PetscScalar *b; 1287 1288 PetscFunctionBegin; 1289 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1290 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1291 /* forward solve the lower triangular */ 1292 idx = 0; 1293 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1294 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1295 x[6] = b[6+idx]; 1296 for (i=1; i<n; i++) { 1297 v = aa + 49*ai[i]; 1298 vi = aj + ai[i]; 1299 nz = diag[i] - ai[i]; 1300 idx = 7*i; 1301 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1302 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1303 s7 = b[6+idx]; 1304 while (nz--) { 1305 jdx = 7*(*vi++); 1306 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1307 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1308 x7 = x[6+jdx]; 1309 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1310 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1311 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1312 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1313 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1314 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1315 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1316 v += 49; 1317 } 1318 x[idx] = s1; 1319 x[1+idx] = s2; 1320 x[2+idx] = s3; 1321 x[3+idx] = s4; 1322 x[4+idx] = s5; 1323 x[5+idx] = s6; 1324 x[6+idx] = s7; 1325 } 1326 /* backward solve the upper triangular */ 1327 for (i=n-1; i>=0; i--){ 1328 v = aa + 49*diag[i] + 49; 1329 vi = aj + diag[i] + 1; 1330 nz = ai[i+1] - diag[i] - 1; 1331 idt = 7*i; 1332 s1 = x[idt]; s2 = x[1+idt]; 1333 s3 = x[2+idt]; s4 = x[3+idt]; 1334 s5 = x[4+idt]; s6 = x[5+idt]; 1335 s7 = x[6+idt]; 1336 while (nz--) { 1337 idx = 7*(*vi++); 1338 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1339 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1340 x7 = x[6+idx]; 1341 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1342 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1343 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1344 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1345 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1346 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1347 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1348 v += 49; 1349 } 1350 v = aa + 49*diag[i]; 1351 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1352 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1353 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1354 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1355 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1356 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1357 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1358 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1359 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1360 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1361 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1362 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1363 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1364 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1365 } 1366 1367 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1369 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1370 PetscFunctionReturn(0); 1371 } 1372 1373 #undef __FUNCT__ 1374 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1375 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1376 { 1377 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1378 IS iscol=a->col,isrow=a->row; 1379 PetscErrorCode ierr; 1380 const PetscInt *r,*c,*rout,*cout; 1381 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1382 const MatScalar *aa=a->a,*v; 1383 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1384 const PetscScalar *b; 1385 PetscFunctionBegin; 1386 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1387 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1388 t = a->solve_work; 1389 1390 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1391 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1392 1393 /* forward solve the lower triangular */ 1394 idx = 6*(*r++); 1395 t[0] = b[idx]; t[1] = b[1+idx]; 1396 t[2] = b[2+idx]; t[3] = b[3+idx]; 1397 t[4] = b[4+idx]; t[5] = b[5+idx]; 1398 for (i=1; i<n; i++) { 1399 v = aa + 36*ai[i]; 1400 vi = aj + ai[i]; 1401 nz = diag[i] - ai[i]; 1402 idx = 6*(*r++); 1403 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1404 s5 = b[4+idx]; s6 = b[5+idx]; 1405 while (nz--) { 1406 idx = 6*(*vi++); 1407 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1408 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1409 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1415 v += 36; 1416 } 1417 idx = 6*i; 1418 t[idx] = s1;t[1+idx] = s2; 1419 t[2+idx] = s3;t[3+idx] = s4; 1420 t[4+idx] = s5;t[5+idx] = s6; 1421 } 1422 /* backward solve the upper triangular */ 1423 for (i=n-1; i>=0; i--){ 1424 v = aa + 36*diag[i] + 36; 1425 vi = aj + diag[i] + 1; 1426 nz = ai[i+1] - diag[i] - 1; 1427 idt = 6*i; 1428 s1 = t[idt]; s2 = t[1+idt]; 1429 s3 = t[2+idt];s4 = t[3+idt]; 1430 s5 = t[4+idt];s6 = t[5+idt]; 1431 while (nz--) { 1432 idx = 6*(*vi++); 1433 x1 = t[idx]; x2 = t[1+idx]; 1434 x3 = t[2+idx]; x4 = t[3+idx]; 1435 x5 = t[4+idx]; x6 = t[5+idx]; 1436 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1437 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1438 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1439 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1440 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1441 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1442 v += 36; 1443 } 1444 idc = 6*(*c--); 1445 v = aa + 36*diag[i]; 1446 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1447 v[18]*s4+v[24]*s5+v[30]*s6; 1448 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1449 v[19]*s4+v[25]*s5+v[31]*s6; 1450 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1451 v[20]*s4+v[26]*s5+v[32]*s6; 1452 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1453 v[21]*s4+v[27]*s5+v[33]*s6; 1454 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1455 v[22]*s4+v[28]*s5+v[34]*s6; 1456 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1457 v[23]*s4+v[29]*s5+v[35]*s6; 1458 } 1459 1460 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1461 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1462 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1463 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1464 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1465 PetscFunctionReturn(0); 1466 } 1467 1468 #undef __FUNCT__ 1469 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1470 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1471 { 1472 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1473 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1474 PetscErrorCode ierr; 1475 PetscInt *diag = a->diag,jdx; 1476 const MatScalar *aa=a->a,*v; 1477 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1478 const PetscScalar *b; 1479 1480 PetscFunctionBegin; 1481 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1482 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1483 /* forward solve the lower triangular */ 1484 idx = 0; 1485 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1486 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1487 for (i=1; i<n; i++) { 1488 v = aa + 36*ai[i]; 1489 vi = aj + ai[i]; 1490 nz = diag[i] - ai[i]; 1491 idx = 6*i; 1492 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1493 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1494 while (nz--) { 1495 jdx = 6*(*vi++); 1496 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1497 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1498 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1499 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1500 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1501 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1502 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1503 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1504 v += 36; 1505 } 1506 x[idx] = s1; 1507 x[1+idx] = s2; 1508 x[2+idx] = s3; 1509 x[3+idx] = s4; 1510 x[4+idx] = s5; 1511 x[5+idx] = s6; 1512 } 1513 /* backward solve the upper triangular */ 1514 for (i=n-1; i>=0; i--){ 1515 v = aa + 36*diag[i] + 36; 1516 vi = aj + diag[i] + 1; 1517 nz = ai[i+1] - diag[i] - 1; 1518 idt = 6*i; 1519 s1 = x[idt]; s2 = x[1+idt]; 1520 s3 = x[2+idt]; s4 = x[3+idt]; 1521 s5 = x[4+idt]; s6 = x[5+idt]; 1522 while (nz--) { 1523 idx = 6*(*vi++); 1524 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1525 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1526 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1527 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1528 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1529 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1530 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1531 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1532 v += 36; 1533 } 1534 v = aa + 36*diag[i]; 1535 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1536 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1537 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1538 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1539 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1540 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1541 } 1542 1543 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1544 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1545 ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr); 1546 PetscFunctionReturn(0); 1547 } 1548 1549 #undef __FUNCT__ 1550 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1551 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1552 { 1553 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1554 IS iscol=a->col,isrow=a->row; 1555 PetscErrorCode ierr; 1556 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1557 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1558 const MatScalar *aa=a->a,*v; 1559 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1560 const PetscScalar *b; 1561 1562 PetscFunctionBegin; 1563 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1564 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1565 t = a->solve_work; 1566 1567 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1568 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1569 1570 /* forward solve the lower triangular */ 1571 idx = 5*(*r++); 1572 t[0] = b[idx]; t[1] = b[1+idx]; 1573 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1574 for (i=1; i<n; i++) { 1575 v = aa + 25*ai[i]; 1576 vi = aj + ai[i]; 1577 nz = diag[i] - ai[i]; 1578 idx = 5*(*r++); 1579 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1580 s5 = b[4+idx]; 1581 while (nz--) { 1582 idx = 5*(*vi++); 1583 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1584 x4 = t[3+idx];x5 = t[4+idx]; 1585 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1586 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1587 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1588 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1589 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1590 v += 25; 1591 } 1592 idx = 5*i; 1593 t[idx] = s1;t[1+idx] = s2; 1594 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1595 } 1596 /* backward solve the upper triangular */ 1597 for (i=n-1; i>=0; i--){ 1598 v = aa + 25*diag[i] + 25; 1599 vi = aj + diag[i] + 1; 1600 nz = ai[i+1] - diag[i] - 1; 1601 idt = 5*i; 1602 s1 = t[idt]; s2 = t[1+idt]; 1603 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1604 while (nz--) { 1605 idx = 5*(*vi++); 1606 x1 = t[idx]; x2 = t[1+idx]; 1607 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1608 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1609 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1610 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1611 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1612 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1613 v += 25; 1614 } 1615 idc = 5*(*c--); 1616 v = aa + 25*diag[i]; 1617 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1618 v[15]*s4+v[20]*s5; 1619 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1620 v[16]*s4+v[21]*s5; 1621 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1622 v[17]*s4+v[22]*s5; 1623 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1624 v[18]*s4+v[23]*s5; 1625 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1626 v[19]*s4+v[24]*s5; 1627 } 1628 1629 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1630 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1631 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1632 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1633 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 1634 PetscFunctionReturn(0); 1635 } 1636 1637 #undef __FUNCT__ 1638 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1639 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1640 { 1641 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1642 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1643 PetscErrorCode ierr; 1644 PetscInt *diag = a->diag,jdx; 1645 const MatScalar *aa=a->a,*v; 1646 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1647 const PetscScalar *b; 1648 1649 PetscFunctionBegin; 1650 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1651 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1652 /* forward solve the lower triangular */ 1653 idx = 0; 1654 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1655 for (i=1; i<n; i++) { 1656 v = aa + 25*ai[i]; 1657 vi = aj + ai[i]; 1658 nz = diag[i] - ai[i]; 1659 idx = 5*i; 1660 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1661 while (nz--) { 1662 jdx = 5*(*vi++); 1663 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1664 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1665 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1666 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1667 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1668 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1669 v += 25; 1670 } 1671 x[idx] = s1; 1672 x[1+idx] = s2; 1673 x[2+idx] = s3; 1674 x[3+idx] = s4; 1675 x[4+idx] = s5; 1676 } 1677 /* backward solve the upper triangular */ 1678 for (i=n-1; i>=0; i--){ 1679 v = aa + 25*diag[i] + 25; 1680 vi = aj + diag[i] + 1; 1681 nz = ai[i+1] - diag[i] - 1; 1682 idt = 5*i; 1683 s1 = x[idt]; s2 = x[1+idt]; 1684 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1685 while (nz--) { 1686 idx = 5*(*vi++); 1687 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1688 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1689 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1690 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1691 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1692 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1693 v += 25; 1694 } 1695 v = aa + 25*diag[i]; 1696 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1697 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1698 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1699 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1700 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1701 } 1702 1703 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1704 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1705 ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr); 1706 PetscFunctionReturn(0); 1707 } 1708 1709 #undef __FUNCT__ 1710 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1711 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1712 { 1713 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1714 IS iscol=a->col,isrow=a->row; 1715 PetscErrorCode ierr; 1716 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1717 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1718 const MatScalar *aa=a->a,*v; 1719 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1720 const PetscScalar *b; 1721 1722 PetscFunctionBegin; 1723 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1724 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1725 t = a->solve_work; 1726 1727 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1728 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1729 1730 /* forward solve the lower triangular */ 1731 idx = 4*(*r++); 1732 t[0] = b[idx]; t[1] = b[1+idx]; 1733 t[2] = b[2+idx]; t[3] = b[3+idx]; 1734 for (i=1; i<n; i++) { 1735 v = aa + 16*ai[i]; 1736 vi = aj + ai[i]; 1737 nz = diag[i] - ai[i]; 1738 idx = 4*(*r++); 1739 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1740 while (nz--) { 1741 idx = 4*(*vi++); 1742 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1743 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1744 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1745 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1746 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1747 v += 16; 1748 } 1749 idx = 4*i; 1750 t[idx] = s1;t[1+idx] = s2; 1751 t[2+idx] = s3;t[3+idx] = s4; 1752 } 1753 /* backward solve the upper triangular */ 1754 for (i=n-1; i>=0; i--){ 1755 v = aa + 16*diag[i] + 16; 1756 vi = aj + diag[i] + 1; 1757 nz = ai[i+1] - diag[i] - 1; 1758 idt = 4*i; 1759 s1 = t[idt]; s2 = t[1+idt]; 1760 s3 = t[2+idt];s4 = t[3+idt]; 1761 while (nz--) { 1762 idx = 4*(*vi++); 1763 x1 = t[idx]; x2 = t[1+idx]; 1764 x3 = t[2+idx]; x4 = t[3+idx]; 1765 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1766 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1767 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1768 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1769 v += 16; 1770 } 1771 idc = 4*(*c--); 1772 v = aa + 16*diag[i]; 1773 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1774 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1775 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1776 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1777 } 1778 1779 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1780 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1781 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1782 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1783 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 1784 PetscFunctionReturn(0); 1785 } 1786 1787 #undef __FUNCT__ 1788 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1789 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1790 { 1791 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1792 IS iscol=a->col,isrow=a->row; 1793 PetscErrorCode ierr; 1794 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1795 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1796 const MatScalar *aa=a->a,*v; 1797 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1798 PetscScalar *x; 1799 const PetscScalar *b; 1800 1801 PetscFunctionBegin; 1802 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1803 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1804 t = (MatScalar *)a->solve_work; 1805 1806 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1807 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1808 1809 /* forward solve the lower triangular */ 1810 idx = 4*(*r++); 1811 t[0] = (MatScalar)b[idx]; 1812 t[1] = (MatScalar)b[1+idx]; 1813 t[2] = (MatScalar)b[2+idx]; 1814 t[3] = (MatScalar)b[3+idx]; 1815 for (i=1; i<n; i++) { 1816 v = aa + 16*ai[i]; 1817 vi = aj + ai[i]; 1818 nz = diag[i] - ai[i]; 1819 idx = 4*(*r++); 1820 s1 = (MatScalar)b[idx]; 1821 s2 = (MatScalar)b[1+idx]; 1822 s3 = (MatScalar)b[2+idx]; 1823 s4 = (MatScalar)b[3+idx]; 1824 while (nz--) { 1825 idx = 4*(*vi++); 1826 x1 = t[idx]; 1827 x2 = t[1+idx]; 1828 x3 = t[2+idx]; 1829 x4 = t[3+idx]; 1830 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1831 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1832 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1833 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1834 v += 16; 1835 } 1836 idx = 4*i; 1837 t[idx] = s1; 1838 t[1+idx] = s2; 1839 t[2+idx] = s3; 1840 t[3+idx] = s4; 1841 } 1842 /* backward solve the upper triangular */ 1843 for (i=n-1; i>=0; i--){ 1844 v = aa + 16*diag[i] + 16; 1845 vi = aj + diag[i] + 1; 1846 nz = ai[i+1] - diag[i] - 1; 1847 idt = 4*i; 1848 s1 = t[idt]; 1849 s2 = t[1+idt]; 1850 s3 = t[2+idt]; 1851 s4 = t[3+idt]; 1852 while (nz--) { 1853 idx = 4*(*vi++); 1854 x1 = t[idx]; 1855 x2 = t[1+idx]; 1856 x3 = t[2+idx]; 1857 x4 = t[3+idx]; 1858 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1859 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1860 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1861 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1862 v += 16; 1863 } 1864 idc = 4*(*c--); 1865 v = aa + 16*diag[i]; 1866 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1867 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1868 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1869 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1870 x[idc] = (PetscScalar)t[idt]; 1871 x[1+idc] = (PetscScalar)t[1+idt]; 1872 x[2+idc] = (PetscScalar)t[2+idt]; 1873 x[3+idc] = (PetscScalar)t[3+idt]; 1874 } 1875 1876 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1877 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1878 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1879 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1880 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 1881 PetscFunctionReturn(0); 1882 } 1883 1884 #if defined (PETSC_HAVE_SSE) 1885 1886 #include PETSC_HAVE_SSE 1887 1888 #undef __FUNCT__ 1889 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1890 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1891 { 1892 /* 1893 Note: This code uses demotion of double 1894 to float when performing the mixed-mode computation. 1895 This may not be numerically reasonable for all applications. 1896 */ 1897 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1898 IS iscol=a->col,isrow=a->row; 1899 PetscErrorCode ierr; 1900 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 1901 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1902 MatScalar *aa=a->a,*v; 1903 PetscScalar *x,*b,*t; 1904 1905 /* Make space in temp stack for 16 Byte Aligned arrays */ 1906 float ssealignedspace[11],*tmps,*tmpx; 1907 unsigned long offset; 1908 1909 PetscFunctionBegin; 1910 SSE_SCOPE_BEGIN; 1911 1912 offset = (unsigned long)ssealignedspace % 16; 1913 if (offset) offset = (16 - offset)/4; 1914 tmps = &ssealignedspace[offset]; 1915 tmpx = &ssealignedspace[offset+4]; 1916 PREFETCH_NTA(aa+16*ai[1]); 1917 1918 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1919 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1920 t = a->solve_work; 1921 1922 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1923 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1924 1925 /* forward solve the lower triangular */ 1926 idx = 4*(*r++); 1927 t[0] = b[idx]; t[1] = b[1+idx]; 1928 t[2] = b[2+idx]; t[3] = b[3+idx]; 1929 v = aa + 16*ai[1]; 1930 1931 for (i=1; i<n;) { 1932 PREFETCH_NTA(&v[8]); 1933 vi = aj + ai[i]; 1934 nz = diag[i] - ai[i]; 1935 idx = 4*(*r++); 1936 1937 /* Demote sum from double to float */ 1938 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1939 LOAD_PS(tmps,XMM7); 1940 1941 while (nz--) { 1942 PREFETCH_NTA(&v[16]); 1943 idx = 4*(*vi++); 1944 1945 /* Demote solution (so far) from double to float */ 1946 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1947 1948 /* 4x4 Matrix-Vector product with negative accumulation: */ 1949 SSE_INLINE_BEGIN_2(tmpx,v) 1950 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1951 1952 /* First Column */ 1953 SSE_COPY_PS(XMM0,XMM6) 1954 SSE_SHUFFLE(XMM0,XMM0,0x00) 1955 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1956 SSE_SUB_PS(XMM7,XMM0) 1957 1958 /* Second Column */ 1959 SSE_COPY_PS(XMM1,XMM6) 1960 SSE_SHUFFLE(XMM1,XMM1,0x55) 1961 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1962 SSE_SUB_PS(XMM7,XMM1) 1963 1964 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1965 1966 /* Third Column */ 1967 SSE_COPY_PS(XMM2,XMM6) 1968 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1969 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1970 SSE_SUB_PS(XMM7,XMM2) 1971 1972 /* Fourth Column */ 1973 SSE_COPY_PS(XMM3,XMM6) 1974 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1975 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1976 SSE_SUB_PS(XMM7,XMM3) 1977 SSE_INLINE_END_2 1978 1979 v += 16; 1980 } 1981 idx = 4*i; 1982 v = aa + 16*ai[++i]; 1983 PREFETCH_NTA(v); 1984 STORE_PS(tmps,XMM7); 1985 1986 /* Promote result from float to double */ 1987 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1988 } 1989 /* backward solve the upper triangular */ 1990 idt = 4*(n-1); 1991 ai16 = 16*diag[n-1]; 1992 v = aa + ai16 + 16; 1993 for (i=n-1; i>=0;){ 1994 PREFETCH_NTA(&v[8]); 1995 vi = aj + diag[i] + 1; 1996 nz = ai[i+1] - diag[i] - 1; 1997 1998 /* Demote accumulator from double to float */ 1999 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2000 LOAD_PS(tmps,XMM7); 2001 2002 while (nz--) { 2003 PREFETCH_NTA(&v[16]); 2004 idx = 4*(*vi++); 2005 2006 /* Demote solution (so far) from double to float */ 2007 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2008 2009 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2010 SSE_INLINE_BEGIN_2(tmpx,v) 2011 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2012 2013 /* First Column */ 2014 SSE_COPY_PS(XMM0,XMM6) 2015 SSE_SHUFFLE(XMM0,XMM0,0x00) 2016 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2017 SSE_SUB_PS(XMM7,XMM0) 2018 2019 /* Second Column */ 2020 SSE_COPY_PS(XMM1,XMM6) 2021 SSE_SHUFFLE(XMM1,XMM1,0x55) 2022 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2023 SSE_SUB_PS(XMM7,XMM1) 2024 2025 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2026 2027 /* Third Column */ 2028 SSE_COPY_PS(XMM2,XMM6) 2029 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2030 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2031 SSE_SUB_PS(XMM7,XMM2) 2032 2033 /* Fourth Column */ 2034 SSE_COPY_PS(XMM3,XMM6) 2035 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2036 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2037 SSE_SUB_PS(XMM7,XMM3) 2038 SSE_INLINE_END_2 2039 v += 16; 2040 } 2041 v = aa + ai16; 2042 ai16 = 16*diag[--i]; 2043 PREFETCH_NTA(aa+ai16+16); 2044 /* 2045 Scale the result by the diagonal 4x4 block, 2046 which was inverted as part of the factorization 2047 */ 2048 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2049 /* First Column */ 2050 SSE_COPY_PS(XMM0,XMM7) 2051 SSE_SHUFFLE(XMM0,XMM0,0x00) 2052 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2053 2054 /* Second Column */ 2055 SSE_COPY_PS(XMM1,XMM7) 2056 SSE_SHUFFLE(XMM1,XMM1,0x55) 2057 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2058 SSE_ADD_PS(XMM0,XMM1) 2059 2060 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2061 2062 /* Third Column */ 2063 SSE_COPY_PS(XMM2,XMM7) 2064 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2065 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2066 SSE_ADD_PS(XMM0,XMM2) 2067 2068 /* Fourth Column */ 2069 SSE_COPY_PS(XMM3,XMM7) 2070 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2071 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2072 SSE_ADD_PS(XMM0,XMM3) 2073 2074 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2075 SSE_INLINE_END_3 2076 2077 /* Promote solution from float to double */ 2078 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2079 2080 /* Apply reordering to t and stream into x. */ 2081 /* This way, x doesn't pollute the cache. */ 2082 /* Be careful with size: 2 doubles = 4 floats! */ 2083 idc = 4*(*c--); 2084 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2085 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2086 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2087 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2088 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2089 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2090 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2091 SSE_INLINE_END_2 2092 v = aa + ai16 + 16; 2093 idt -= 4; 2094 } 2095 2096 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2097 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2098 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2099 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2100 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2101 SSE_SCOPE_END; 2102 PetscFunctionReturn(0); 2103 } 2104 2105 #endif 2106 2107 2108 /* 2109 Special case where the matrix was ILU(0) factored in the natural 2110 ordering. This eliminates the need for the column and row permutation. 2111 */ 2112 #undef __FUNCT__ 2113 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2114 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2115 { 2116 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2117 PetscInt n=a->mbs; 2118 const PetscInt *ai=a->i,*aj=a->j; 2119 PetscErrorCode ierr; 2120 const PetscInt *diag = a->diag; 2121 const MatScalar *aa=a->a; 2122 PetscScalar *x; 2123 const PetscScalar *b; 2124 2125 PetscFunctionBegin; 2126 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2127 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2128 2129 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2130 { 2131 static PetscScalar w[2000]; /* very BAD need to fix */ 2132 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2133 } 2134 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2135 { 2136 static PetscScalar w[2000]; /* very BAD need to fix */ 2137 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2138 } 2139 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2140 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2141 #else 2142 { 2143 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2144 const MatScalar *v; 2145 PetscInt jdx,idt,idx,nz,i,ai16; 2146 const PetscInt *vi; 2147 2148 /* forward solve the lower triangular */ 2149 idx = 0; 2150 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2151 for (i=1; i<n; i++) { 2152 v = aa + 16*ai[i]; 2153 vi = aj + ai[i]; 2154 nz = diag[i] - ai[i]; 2155 idx += 4; 2156 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2157 while (nz--) { 2158 jdx = 4*(*vi++); 2159 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2160 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2161 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2162 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2163 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2164 v += 16; 2165 } 2166 x[idx] = s1; 2167 x[1+idx] = s2; 2168 x[2+idx] = s3; 2169 x[3+idx] = s4; 2170 } 2171 /* backward solve the upper triangular */ 2172 idt = 4*(n-1); 2173 for (i=n-1; i>=0; i--){ 2174 ai16 = 16*diag[i]; 2175 v = aa + ai16 + 16; 2176 vi = aj + diag[i] + 1; 2177 nz = ai[i+1] - diag[i] - 1; 2178 s1 = x[idt]; s2 = x[1+idt]; 2179 s3 = x[2+idt];s4 = x[3+idt]; 2180 while (nz--) { 2181 idx = 4*(*vi++); 2182 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2183 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2184 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2185 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2186 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2187 v += 16; 2188 } 2189 v = aa + ai16; 2190 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2191 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2192 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2193 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2194 idt -= 4; 2195 } 2196 } 2197 #endif 2198 2199 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2200 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2201 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2202 PetscFunctionReturn(0); 2203 } 2204 2205 #undef __FUNCT__ 2206 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2207 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2208 { 2209 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2210 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2211 PetscErrorCode ierr; 2212 PetscInt *diag = a->diag; 2213 MatScalar *aa=a->a; 2214 PetscScalar *x,*b; 2215 2216 PetscFunctionBegin; 2217 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2218 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2219 2220 { 2221 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2222 MatScalar *v,*t=(MatScalar *)x; 2223 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2224 2225 /* forward solve the lower triangular */ 2226 idx = 0; 2227 t[0] = (MatScalar)b[0]; 2228 t[1] = (MatScalar)b[1]; 2229 t[2] = (MatScalar)b[2]; 2230 t[3] = (MatScalar)b[3]; 2231 for (i=1; i<n; i++) { 2232 v = aa + 16*ai[i]; 2233 vi = aj + ai[i]; 2234 nz = diag[i] - ai[i]; 2235 idx += 4; 2236 s1 = (MatScalar)b[idx]; 2237 s2 = (MatScalar)b[1+idx]; 2238 s3 = (MatScalar)b[2+idx]; 2239 s4 = (MatScalar)b[3+idx]; 2240 while (nz--) { 2241 jdx = 4*(*vi++); 2242 x1 = t[jdx]; 2243 x2 = t[1+jdx]; 2244 x3 = t[2+jdx]; 2245 x4 = t[3+jdx]; 2246 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2247 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2248 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2249 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2250 v += 16; 2251 } 2252 t[idx] = s1; 2253 t[1+idx] = s2; 2254 t[2+idx] = s3; 2255 t[3+idx] = s4; 2256 } 2257 /* backward solve the upper triangular */ 2258 idt = 4*(n-1); 2259 for (i=n-1; i>=0; i--){ 2260 ai16 = 16*diag[i]; 2261 v = aa + ai16 + 16; 2262 vi = aj + diag[i] + 1; 2263 nz = ai[i+1] - diag[i] - 1; 2264 s1 = t[idt]; 2265 s2 = t[1+idt]; 2266 s3 = t[2+idt]; 2267 s4 = t[3+idt]; 2268 while (nz--) { 2269 idx = 4*(*vi++); 2270 x1 = (MatScalar)x[idx]; 2271 x2 = (MatScalar)x[1+idx]; 2272 x3 = (MatScalar)x[2+idx]; 2273 x4 = (MatScalar)x[3+idx]; 2274 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2275 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2276 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2277 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2278 v += 16; 2279 } 2280 v = aa + ai16; 2281 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2282 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2283 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2284 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2285 idt -= 4; 2286 } 2287 } 2288 2289 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2290 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2291 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2292 PetscFunctionReturn(0); 2293 } 2294 2295 #if defined (PETSC_HAVE_SSE) 2296 2297 #include PETSC_HAVE_SSE 2298 #undef __FUNCT__ 2299 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2300 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2301 { 2302 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2303 unsigned short *aj=(unsigned short *)a->j; 2304 PetscErrorCode ierr; 2305 int *ai=a->i,n=a->mbs,*diag = a->diag; 2306 MatScalar *aa=a->a; 2307 PetscScalar *x,*b; 2308 2309 PetscFunctionBegin; 2310 SSE_SCOPE_BEGIN; 2311 /* 2312 Note: This code currently uses demotion of double 2313 to float when performing the mixed-mode computation. 2314 This may not be numerically reasonable for all applications. 2315 */ 2316 PREFETCH_NTA(aa+16*ai[1]); 2317 2318 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2319 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2320 { 2321 /* x will first be computed in single precision then promoted inplace to double */ 2322 MatScalar *v,*t=(MatScalar *)x; 2323 int nz,i,idt,ai16; 2324 unsigned int jdx,idx; 2325 unsigned short *vi; 2326 /* Forward solve the lower triangular factor. */ 2327 2328 /* First block is the identity. */ 2329 idx = 0; 2330 CONVERT_DOUBLE4_FLOAT4(t,b); 2331 v = aa + 16*((unsigned int)ai[1]); 2332 2333 for (i=1; i<n;) { 2334 PREFETCH_NTA(&v[8]); 2335 vi = aj + ai[i]; 2336 nz = diag[i] - ai[i]; 2337 idx += 4; 2338 2339 /* Demote RHS from double to float. */ 2340 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2341 LOAD_PS(&t[idx],XMM7); 2342 2343 while (nz--) { 2344 PREFETCH_NTA(&v[16]); 2345 jdx = 4*((unsigned int)(*vi++)); 2346 2347 /* 4x4 Matrix-Vector product with negative accumulation: */ 2348 SSE_INLINE_BEGIN_2(&t[jdx],v) 2349 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2350 2351 /* First Column */ 2352 SSE_COPY_PS(XMM0,XMM6) 2353 SSE_SHUFFLE(XMM0,XMM0,0x00) 2354 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2355 SSE_SUB_PS(XMM7,XMM0) 2356 2357 /* Second Column */ 2358 SSE_COPY_PS(XMM1,XMM6) 2359 SSE_SHUFFLE(XMM1,XMM1,0x55) 2360 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2361 SSE_SUB_PS(XMM7,XMM1) 2362 2363 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2364 2365 /* Third Column */ 2366 SSE_COPY_PS(XMM2,XMM6) 2367 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2368 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2369 SSE_SUB_PS(XMM7,XMM2) 2370 2371 /* Fourth Column */ 2372 SSE_COPY_PS(XMM3,XMM6) 2373 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2374 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2375 SSE_SUB_PS(XMM7,XMM3) 2376 SSE_INLINE_END_2 2377 2378 v += 16; 2379 } 2380 v = aa + 16*ai[++i]; 2381 PREFETCH_NTA(v); 2382 STORE_PS(&t[idx],XMM7); 2383 } 2384 2385 /* Backward solve the upper triangular factor.*/ 2386 2387 idt = 4*(n-1); 2388 ai16 = 16*diag[n-1]; 2389 v = aa + ai16 + 16; 2390 for (i=n-1; i>=0;){ 2391 PREFETCH_NTA(&v[8]); 2392 vi = aj + diag[i] + 1; 2393 nz = ai[i+1] - diag[i] - 1; 2394 2395 LOAD_PS(&t[idt],XMM7); 2396 2397 while (nz--) { 2398 PREFETCH_NTA(&v[16]); 2399 idx = 4*((unsigned int)(*vi++)); 2400 2401 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2402 SSE_INLINE_BEGIN_2(&t[idx],v) 2403 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2404 2405 /* First Column */ 2406 SSE_COPY_PS(XMM0,XMM6) 2407 SSE_SHUFFLE(XMM0,XMM0,0x00) 2408 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2409 SSE_SUB_PS(XMM7,XMM0) 2410 2411 /* Second Column */ 2412 SSE_COPY_PS(XMM1,XMM6) 2413 SSE_SHUFFLE(XMM1,XMM1,0x55) 2414 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2415 SSE_SUB_PS(XMM7,XMM1) 2416 2417 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2418 2419 /* Third Column */ 2420 SSE_COPY_PS(XMM2,XMM6) 2421 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2422 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2423 SSE_SUB_PS(XMM7,XMM2) 2424 2425 /* Fourth Column */ 2426 SSE_COPY_PS(XMM3,XMM6) 2427 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2428 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2429 SSE_SUB_PS(XMM7,XMM3) 2430 SSE_INLINE_END_2 2431 v += 16; 2432 } 2433 v = aa + ai16; 2434 ai16 = 16*diag[--i]; 2435 PREFETCH_NTA(aa+ai16+16); 2436 /* 2437 Scale the result by the diagonal 4x4 block, 2438 which was inverted as part of the factorization 2439 */ 2440 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2441 /* First Column */ 2442 SSE_COPY_PS(XMM0,XMM7) 2443 SSE_SHUFFLE(XMM0,XMM0,0x00) 2444 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2445 2446 /* Second Column */ 2447 SSE_COPY_PS(XMM1,XMM7) 2448 SSE_SHUFFLE(XMM1,XMM1,0x55) 2449 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2450 SSE_ADD_PS(XMM0,XMM1) 2451 2452 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2453 2454 /* Third Column */ 2455 SSE_COPY_PS(XMM2,XMM7) 2456 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2457 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2458 SSE_ADD_PS(XMM0,XMM2) 2459 2460 /* Fourth Column */ 2461 SSE_COPY_PS(XMM3,XMM7) 2462 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2463 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2464 SSE_ADD_PS(XMM0,XMM3) 2465 2466 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2467 SSE_INLINE_END_3 2468 2469 v = aa + ai16 + 16; 2470 idt -= 4; 2471 } 2472 2473 /* Convert t from single precision back to double precision (inplace)*/ 2474 idt = 4*(n-1); 2475 for (i=n-1;i>=0;i--) { 2476 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2477 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2478 PetscScalar *xtemp=&x[idt]; 2479 MatScalar *ttemp=&t[idt]; 2480 xtemp[3] = (PetscScalar)ttemp[3]; 2481 xtemp[2] = (PetscScalar)ttemp[2]; 2482 xtemp[1] = (PetscScalar)ttemp[1]; 2483 xtemp[0] = (PetscScalar)ttemp[0]; 2484 idt -= 4; 2485 } 2486 2487 } /* End of artificial scope. */ 2488 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2489 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2490 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2491 SSE_SCOPE_END; 2492 PetscFunctionReturn(0); 2493 } 2494 2495 #undef __FUNCT__ 2496 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2497 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2498 { 2499 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2500 int *aj=a->j; 2501 PetscErrorCode ierr; 2502 int *ai=a->i,n=a->mbs,*diag = a->diag; 2503 MatScalar *aa=a->a; 2504 PetscScalar *x,*b; 2505 2506 PetscFunctionBegin; 2507 SSE_SCOPE_BEGIN; 2508 /* 2509 Note: This code currently uses demotion of double 2510 to float when performing the mixed-mode computation. 2511 This may not be numerically reasonable for all applications. 2512 */ 2513 PREFETCH_NTA(aa+16*ai[1]); 2514 2515 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2516 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2517 { 2518 /* x will first be computed in single precision then promoted inplace to double */ 2519 MatScalar *v,*t=(MatScalar *)x; 2520 int nz,i,idt,ai16; 2521 int jdx,idx; 2522 int *vi; 2523 /* Forward solve the lower triangular factor. */ 2524 2525 /* First block is the identity. */ 2526 idx = 0; 2527 CONVERT_DOUBLE4_FLOAT4(t,b); 2528 v = aa + 16*ai[1]; 2529 2530 for (i=1; i<n;) { 2531 PREFETCH_NTA(&v[8]); 2532 vi = aj + ai[i]; 2533 nz = diag[i] - ai[i]; 2534 idx += 4; 2535 2536 /* Demote RHS from double to float. */ 2537 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2538 LOAD_PS(&t[idx],XMM7); 2539 2540 while (nz--) { 2541 PREFETCH_NTA(&v[16]); 2542 jdx = 4*(*vi++); 2543 /* jdx = *vi++; */ 2544 2545 /* 4x4 Matrix-Vector product with negative accumulation: */ 2546 SSE_INLINE_BEGIN_2(&t[jdx],v) 2547 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2548 2549 /* First Column */ 2550 SSE_COPY_PS(XMM0,XMM6) 2551 SSE_SHUFFLE(XMM0,XMM0,0x00) 2552 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2553 SSE_SUB_PS(XMM7,XMM0) 2554 2555 /* Second Column */ 2556 SSE_COPY_PS(XMM1,XMM6) 2557 SSE_SHUFFLE(XMM1,XMM1,0x55) 2558 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2559 SSE_SUB_PS(XMM7,XMM1) 2560 2561 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2562 2563 /* Third Column */ 2564 SSE_COPY_PS(XMM2,XMM6) 2565 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2566 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2567 SSE_SUB_PS(XMM7,XMM2) 2568 2569 /* Fourth Column */ 2570 SSE_COPY_PS(XMM3,XMM6) 2571 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2572 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2573 SSE_SUB_PS(XMM7,XMM3) 2574 SSE_INLINE_END_2 2575 2576 v += 16; 2577 } 2578 v = aa + 16*ai[++i]; 2579 PREFETCH_NTA(v); 2580 STORE_PS(&t[idx],XMM7); 2581 } 2582 2583 /* Backward solve the upper triangular factor.*/ 2584 2585 idt = 4*(n-1); 2586 ai16 = 16*diag[n-1]; 2587 v = aa + ai16 + 16; 2588 for (i=n-1; i>=0;){ 2589 PREFETCH_NTA(&v[8]); 2590 vi = aj + diag[i] + 1; 2591 nz = ai[i+1] - diag[i] - 1; 2592 2593 LOAD_PS(&t[idt],XMM7); 2594 2595 while (nz--) { 2596 PREFETCH_NTA(&v[16]); 2597 idx = 4*(*vi++); 2598 /* idx = *vi++; */ 2599 2600 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2601 SSE_INLINE_BEGIN_2(&t[idx],v) 2602 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2603 2604 /* First Column */ 2605 SSE_COPY_PS(XMM0,XMM6) 2606 SSE_SHUFFLE(XMM0,XMM0,0x00) 2607 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2608 SSE_SUB_PS(XMM7,XMM0) 2609 2610 /* Second Column */ 2611 SSE_COPY_PS(XMM1,XMM6) 2612 SSE_SHUFFLE(XMM1,XMM1,0x55) 2613 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2614 SSE_SUB_PS(XMM7,XMM1) 2615 2616 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2617 2618 /* Third Column */ 2619 SSE_COPY_PS(XMM2,XMM6) 2620 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2621 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2622 SSE_SUB_PS(XMM7,XMM2) 2623 2624 /* Fourth Column */ 2625 SSE_COPY_PS(XMM3,XMM6) 2626 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2627 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2628 SSE_SUB_PS(XMM7,XMM3) 2629 SSE_INLINE_END_2 2630 v += 16; 2631 } 2632 v = aa + ai16; 2633 ai16 = 16*diag[--i]; 2634 PREFETCH_NTA(aa+ai16+16); 2635 /* 2636 Scale the result by the diagonal 4x4 block, 2637 which was inverted as part of the factorization 2638 */ 2639 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2640 /* First Column */ 2641 SSE_COPY_PS(XMM0,XMM7) 2642 SSE_SHUFFLE(XMM0,XMM0,0x00) 2643 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2644 2645 /* Second Column */ 2646 SSE_COPY_PS(XMM1,XMM7) 2647 SSE_SHUFFLE(XMM1,XMM1,0x55) 2648 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2649 SSE_ADD_PS(XMM0,XMM1) 2650 2651 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2652 2653 /* Third Column */ 2654 SSE_COPY_PS(XMM2,XMM7) 2655 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2656 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2657 SSE_ADD_PS(XMM0,XMM2) 2658 2659 /* Fourth Column */ 2660 SSE_COPY_PS(XMM3,XMM7) 2661 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2662 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2663 SSE_ADD_PS(XMM0,XMM3) 2664 2665 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2666 SSE_INLINE_END_3 2667 2668 v = aa + ai16 + 16; 2669 idt -= 4; 2670 } 2671 2672 /* Convert t from single precision back to double precision (inplace)*/ 2673 idt = 4*(n-1); 2674 for (i=n-1;i>=0;i--) { 2675 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2676 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2677 PetscScalar *xtemp=&x[idt]; 2678 MatScalar *ttemp=&t[idt]; 2679 xtemp[3] = (PetscScalar)ttemp[3]; 2680 xtemp[2] = (PetscScalar)ttemp[2]; 2681 xtemp[1] = (PetscScalar)ttemp[1]; 2682 xtemp[0] = (PetscScalar)ttemp[0]; 2683 idt -= 4; 2684 } 2685 2686 } /* End of artificial scope. */ 2687 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2688 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2689 ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr); 2690 SSE_SCOPE_END; 2691 PetscFunctionReturn(0); 2692 } 2693 2694 #endif 2695 #undef __FUNCT__ 2696 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2697 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2698 { 2699 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2700 IS iscol=a->col,isrow=a->row; 2701 PetscErrorCode ierr; 2702 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2703 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2704 const MatScalar *aa=a->a,*v; 2705 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2706 const PetscScalar *b; 2707 2708 PetscFunctionBegin; 2709 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2710 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2711 t = a->solve_work; 2712 2713 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2714 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2715 2716 /* forward solve the lower triangular */ 2717 idx = 3*(*r++); 2718 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2719 for (i=1; i<n; i++) { 2720 v = aa + 9*ai[i]; 2721 vi = aj + ai[i]; 2722 nz = diag[i] - ai[i]; 2723 idx = 3*(*r++); 2724 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2725 while (nz--) { 2726 idx = 3*(*vi++); 2727 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2728 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2729 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2730 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2731 v += 9; 2732 } 2733 idx = 3*i; 2734 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2735 } 2736 /* backward solve the upper triangular */ 2737 for (i=n-1; i>=0; i--){ 2738 v = aa + 9*diag[i] + 9; 2739 vi = aj + diag[i] + 1; 2740 nz = ai[i+1] - diag[i] - 1; 2741 idt = 3*i; 2742 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2743 while (nz--) { 2744 idx = 3*(*vi++); 2745 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2746 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2747 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2748 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2749 v += 9; 2750 } 2751 idc = 3*(*c--); 2752 v = aa + 9*diag[i]; 2753 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2754 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2755 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2756 } 2757 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2758 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2759 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2760 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2761 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 2762 PetscFunctionReturn(0); 2763 } 2764 2765 /* 2766 Special case where the matrix was ILU(0) factored in the natural 2767 ordering. This eliminates the need for the column and row permutation. 2768 */ 2769 #undef __FUNCT__ 2770 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2771 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2772 { 2773 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2774 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2775 PetscErrorCode ierr; 2776 PetscInt *diag = a->diag; 2777 const MatScalar *aa=a->a,*v; 2778 PetscScalar *x,s1,s2,s3,x1,x2,x3; 2779 const PetscScalar *b; 2780 PetscInt jdx,idt,idx,nz,*vi,i; 2781 2782 PetscFunctionBegin; 2783 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2784 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2785 2786 /* forward solve the lower triangular */ 2787 idx = 0; 2788 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2789 for (i=1; i<n; i++) { 2790 v = aa + 9*ai[i]; 2791 vi = aj + ai[i]; 2792 nz = diag[i] - ai[i]; 2793 idx += 3; 2794 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2795 while (nz--) { 2796 jdx = 3*(*vi++); 2797 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2798 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2799 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2800 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2801 v += 9; 2802 } 2803 x[idx] = s1; 2804 x[1+idx] = s2; 2805 x[2+idx] = s3; 2806 } 2807 /* backward solve the upper triangular */ 2808 for (i=n-1; i>=0; i--){ 2809 v = aa + 9*diag[i] + 9; 2810 vi = aj + diag[i] + 1; 2811 nz = ai[i+1] - diag[i] - 1; 2812 idt = 3*i; 2813 s1 = x[idt]; s2 = x[1+idt]; 2814 s3 = x[2+idt]; 2815 while (nz--) { 2816 idx = 3*(*vi++); 2817 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2818 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2819 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2820 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2821 v += 9; 2822 } 2823 v = aa + 9*diag[i]; 2824 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2825 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2826 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2827 } 2828 2829 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2830 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2831 ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr); 2832 PetscFunctionReturn(0); 2833 } 2834 2835 #undef __FUNCT__ 2836 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2837 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2838 { 2839 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2840 IS iscol=a->col,isrow=a->row; 2841 PetscErrorCode ierr; 2842 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2843 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2844 const MatScalar *aa=a->a,*v; 2845 PetscScalar *x,s1,s2,x1,x2,*t; 2846 const PetscScalar *b; 2847 2848 PetscFunctionBegin; 2849 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2850 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2851 t = a->solve_work; 2852 2853 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2854 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2855 2856 /* forward solve the lower triangular */ 2857 idx = 2*(*r++); 2858 t[0] = b[idx]; t[1] = b[1+idx]; 2859 for (i=1; i<n; i++) { 2860 v = aa + 4*ai[i]; 2861 vi = aj + ai[i]; 2862 nz = diag[i] - ai[i]; 2863 idx = 2*(*r++); 2864 s1 = b[idx]; s2 = b[1+idx]; 2865 while (nz--) { 2866 idx = 2*(*vi++); 2867 x1 = t[idx]; x2 = t[1+idx]; 2868 s1 -= v[0]*x1 + v[2]*x2; 2869 s2 -= v[1]*x1 + v[3]*x2; 2870 v += 4; 2871 } 2872 idx = 2*i; 2873 t[idx] = s1; t[1+idx] = s2; 2874 } 2875 /* backward solve the upper triangular */ 2876 for (i=n-1; i>=0; i--){ 2877 v = aa + 4*diag[i] + 4; 2878 vi = aj + diag[i] + 1; 2879 nz = ai[i+1] - diag[i] - 1; 2880 idt = 2*i; 2881 s1 = t[idt]; s2 = t[1+idt]; 2882 while (nz--) { 2883 idx = 2*(*vi++); 2884 x1 = t[idx]; x2 = t[1+idx]; 2885 s1 -= v[0]*x1 + v[2]*x2; 2886 s2 -= v[1]*x1 + v[3]*x2; 2887 v += 4; 2888 } 2889 idc = 2*(*c--); 2890 v = aa + 4*diag[i]; 2891 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2892 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2893 } 2894 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2895 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2896 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2897 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2898 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 2899 PetscFunctionReturn(0); 2900 } 2901 2902 /* 2903 Special case where the matrix was ILU(0) factored in the natural 2904 ordering. This eliminates the need for the column and row permutation. 2905 */ 2906 #undef __FUNCT__ 2907 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2908 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2909 { 2910 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2911 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2912 PetscErrorCode ierr; 2913 PetscInt *diag = a->diag; 2914 const MatScalar *aa=a->a,*v; 2915 PetscScalar *x,s1,s2,x1,x2; 2916 const PetscScalar *b; 2917 PetscInt jdx,idt,idx,nz,*vi,i; 2918 2919 PetscFunctionBegin; 2920 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2921 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2922 2923 /* forward solve the lower triangular */ 2924 idx = 0; 2925 x[0] = b[0]; x[1] = b[1]; 2926 for (i=1; i<n; i++) { 2927 v = aa + 4*ai[i]; 2928 vi = aj + ai[i]; 2929 nz = diag[i] - ai[i]; 2930 idx += 2; 2931 s1 = b[idx];s2 = b[1+idx]; 2932 while (nz--) { 2933 jdx = 2*(*vi++); 2934 x1 = x[jdx];x2 = x[1+jdx]; 2935 s1 -= v[0]*x1 + v[2]*x2; 2936 s2 -= v[1]*x1 + v[3]*x2; 2937 v += 4; 2938 } 2939 x[idx] = s1; 2940 x[1+idx] = s2; 2941 } 2942 /* backward solve the upper triangular */ 2943 for (i=n-1; i>=0; i--){ 2944 v = aa + 4*diag[i] + 4; 2945 vi = aj + diag[i] + 1; 2946 nz = ai[i+1] - diag[i] - 1; 2947 idt = 2*i; 2948 s1 = x[idt]; s2 = x[1+idt]; 2949 while (nz--) { 2950 idx = 2*(*vi++); 2951 x1 = x[idx]; x2 = x[1+idx]; 2952 s1 -= v[0]*x1 + v[2]*x2; 2953 s2 -= v[1]*x1 + v[3]*x2; 2954 v += 4; 2955 } 2956 v = aa + 4*diag[i]; 2957 x[idt] = v[0]*s1 + v[2]*s2; 2958 x[1+idt] = v[1]*s1 + v[3]*s2; 2959 } 2960 2961 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2962 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2963 ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr); 2964 PetscFunctionReturn(0); 2965 } 2966 2967 #undef __FUNCT__ 2968 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2969 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2970 { 2971 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2972 IS iscol=a->col,isrow=a->row; 2973 PetscErrorCode ierr; 2974 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2975 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2976 MatScalar *aa=a->a,*v; 2977 PetscScalar *x,*b,s1,*t; 2978 2979 PetscFunctionBegin; 2980 if (!n) PetscFunctionReturn(0); 2981 2982 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2983 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2984 t = a->solve_work; 2985 2986 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2987 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2988 2989 /* forward solve the lower triangular */ 2990 t[0] = b[*r++]; 2991 for (i=1; i<n; i++) { 2992 v = aa + ai[i]; 2993 vi = aj + ai[i]; 2994 nz = diag[i] - ai[i]; 2995 s1 = b[*r++]; 2996 while (nz--) { 2997 s1 -= (*v++)*t[*vi++]; 2998 } 2999 t[i] = s1; 3000 } 3001 /* backward solve the upper triangular */ 3002 for (i=n-1; i>=0; i--){ 3003 v = aa + diag[i] + 1; 3004 vi = aj + diag[i] + 1; 3005 nz = ai[i+1] - diag[i] - 1; 3006 s1 = t[i]; 3007 while (nz--) { 3008 s1 -= (*v++)*t[*vi++]; 3009 } 3010 x[*c--] = t[i] = aa[diag[i]]*s1; 3011 } 3012 3013 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3014 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3015 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3016 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3017 ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3018 PetscFunctionReturn(0); 3019 } 3020 /* 3021 Special case where the matrix was ILU(0) factored in the natural 3022 ordering. This eliminates the need for the column and row permutation. 3023 */ 3024 #undef __FUNCT__ 3025 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3026 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3027 { 3028 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3029 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3030 PetscErrorCode ierr; 3031 PetscInt *diag = a->diag; 3032 MatScalar *aa=a->a; 3033 PetscScalar *x,*b; 3034 PetscScalar s1,x1; 3035 MatScalar *v; 3036 PetscInt jdx,idt,idx,nz,*vi,i; 3037 3038 PetscFunctionBegin; 3039 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3040 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3041 3042 /* forward solve the lower triangular */ 3043 idx = 0; 3044 x[0] = b[0]; 3045 for (i=1; i<n; i++) { 3046 v = aa + ai[i]; 3047 vi = aj + ai[i]; 3048 nz = diag[i] - ai[i]; 3049 idx += 1; 3050 s1 = b[idx]; 3051 while (nz--) { 3052 jdx = *vi++; 3053 x1 = x[jdx]; 3054 s1 -= v[0]*x1; 3055 v += 1; 3056 } 3057 x[idx] = s1; 3058 } 3059 /* backward solve the upper triangular */ 3060 for (i=n-1; i>=0; i--){ 3061 v = aa + diag[i] + 1; 3062 vi = aj + diag[i] + 1; 3063 nz = ai[i+1] - diag[i] - 1; 3064 idt = i; 3065 s1 = x[idt]; 3066 while (nz--) { 3067 idx = *vi++; 3068 x1 = x[idx]; 3069 s1 -= v[0]*x1; 3070 v += 1; 3071 } 3072 v = aa + diag[i]; 3073 x[idt] = v[0]*s1; 3074 } 3075 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3076 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3077 ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3078 PetscFunctionReturn(0); 3079 } 3080 3081 /* ----------------------------------------------------------------*/ 3082 /* 3083 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3084 except that the data structure of Mat_SeqAIJ is slightly different. 3085 Not a good example of code reuse. 3086 */ 3087 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 3088 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3089 3090 #undef __FUNCT__ 3091 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3092 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3093 { 3094 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3095 IS isicol; 3096 PetscErrorCode ierr; 3097 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 3098 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3099 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3100 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3101 PetscTruth col_identity,row_identity,both_identity,flg; 3102 PetscReal f; 3103 3104 PetscFunctionBegin; 3105 f = info->fill; 3106 levels = (PetscInt)info->levels; 3107 diagonal_fill = (PetscInt)info->diagonal_fill; 3108 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3109 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3110 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3111 both_identity = (PetscTruth) (row_identity && col_identity); 3112 3113 if (!levels && both_identity) { /* special case copy the nonzero structure */ 3114 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3115 fact->factor = MAT_FACTOR_ILU; 3116 b = (Mat_SeqBAIJ*)(fact)->data; 3117 ierr = MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);CHKERRQ(ierr); 3118 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd); 3119 b->row = isrow; 3120 b->col = iscol; 3121 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3122 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3123 b->icol = isicol; 3124 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3125 ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3126 } else { /* general case perform the symbolic factorization */ 3127 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3128 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3129 3130 /* get new row pointers */ 3131 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3132 ainew[0] = 0; 3133 /* don't know how many column pointers are needed so estimate */ 3134 jmax = (PetscInt)(f*ai[n] + 1); 3135 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3136 /* ajfill is level of fill for each fill entry */ 3137 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3138 /* fill is a linked list of nonzeros in active row */ 3139 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3140 /* im is level for each filled value */ 3141 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3142 /* dloc is location of diagonal in factor */ 3143 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3144 dloc[0] = 0; 3145 for (prow=0; prow<n; prow++) { 3146 3147 /* copy prow into linked list */ 3148 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3149 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3150 xi = aj + ai[r[prow]]; 3151 fill[n] = n; 3152 fill[prow] = -1; /* marker for diagonal entry */ 3153 while (nz--) { 3154 fm = n; 3155 idx = ic[*xi++]; 3156 do { 3157 m = fm; 3158 fm = fill[m]; 3159 } while (fm < idx); 3160 fill[m] = idx; 3161 fill[idx] = fm; 3162 im[idx] = 0; 3163 } 3164 3165 /* make sure diagonal entry is included */ 3166 if (diagonal_fill && fill[prow] == -1) { 3167 fm = n; 3168 while (fill[fm] < prow) fm = fill[fm]; 3169 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3170 fill[fm] = prow; 3171 im[prow] = 0; 3172 nzf++; 3173 dcount++; 3174 } 3175 3176 nzi = 0; 3177 row = fill[n]; 3178 while (row < prow) { 3179 incrlev = im[row] + 1; 3180 nz = dloc[row]; 3181 xi = ajnew + ainew[row] + nz + 1; 3182 flev = ajfill + ainew[row] + nz + 1; 3183 nnz = ainew[row+1] - ainew[row] - nz - 1; 3184 fm = row; 3185 while (nnz-- > 0) { 3186 idx = *xi++; 3187 if (*flev + incrlev > levels) { 3188 flev++; 3189 continue; 3190 } 3191 do { 3192 m = fm; 3193 fm = fill[m]; 3194 } while (fm < idx); 3195 if (fm != idx) { 3196 im[idx] = *flev + incrlev; 3197 fill[m] = idx; 3198 fill[idx] = fm; 3199 fm = idx; 3200 nzf++; 3201 } else { 3202 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3203 } 3204 flev++; 3205 } 3206 row = fill[row]; 3207 nzi++; 3208 } 3209 /* copy new filled row into permanent storage */ 3210 ainew[prow+1] = ainew[prow] + nzf; 3211 if (ainew[prow+1] > jmax) { 3212 3213 /* estimate how much additional space we will need */ 3214 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3215 /* just double the memory each time */ 3216 PetscInt maxadd = jmax; 3217 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3218 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3219 jmax += maxadd; 3220 3221 /* allocate a longer ajnew and ajfill */ 3222 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3223 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3224 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3225 ajnew = xitmp; 3226 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3227 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3228 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3229 ajfill = xitmp; 3230 reallocate++; /* count how many reallocations are needed */ 3231 } 3232 xitmp = ajnew + ainew[prow]; 3233 flev = ajfill + ainew[prow]; 3234 dloc[prow] = nzi; 3235 fm = fill[n]; 3236 while (nzf--) { 3237 *xitmp++ = fm; 3238 *flev++ = im[fm]; 3239 fm = fill[fm]; 3240 } 3241 /* make sure row has diagonal entry */ 3242 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3243 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3244 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3245 } 3246 } 3247 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3248 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3249 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3250 ierr = PetscFree(fill);CHKERRQ(ierr); 3251 ierr = PetscFree(im);CHKERRQ(ierr); 3252 3253 #if defined(PETSC_USE_INFO) 3254 { 3255 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3256 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3257 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3258 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3259 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3260 if (diagonal_fill) { 3261 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3262 } 3263 } 3264 #endif 3265 3266 /* put together the new matrix */ 3267 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3268 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3269 b = (Mat_SeqBAIJ*)(fact)->data; 3270 b->free_a = PETSC_TRUE; 3271 b->free_ij = PETSC_TRUE; 3272 b->singlemalloc = PETSC_FALSE; 3273 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3274 b->j = ajnew; 3275 b->i = ainew; 3276 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3277 b->diag = dloc; 3278 b->ilen = 0; 3279 b->imax = 0; 3280 b->row = isrow; 3281 b->col = iscol; 3282 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3283 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3284 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3285 b->icol = isicol; 3286 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3287 /* In b structure: Free imax, ilen, old a, old j. 3288 Allocate dloc, solve_work, new a, new j */ 3289 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3290 b->maxnz = b->nz = ainew[n]; 3291 3292 (fact)->info.factor_mallocs = reallocate; 3293 (fact)->info.fill_ratio_given = f; 3294 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3295 } 3296 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3297 PetscFunctionReturn(0); 3298 } 3299 3300 #undef __FUNCT__ 3301 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3302 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3303 { 3304 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3305 /* int i,*AJ=a->j,nz=a->nz; */ 3306 PetscFunctionBegin; 3307 /* Undo Column scaling */ 3308 /* while (nz--) { */ 3309 /* AJ[i] = AJ[i]/4; */ 3310 /* } */ 3311 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3312 A->ops->setunfactored = PETSC_NULL; 3313 PetscFunctionReturn(0); 3314 } 3315 3316 #undef __FUNCT__ 3317 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3318 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3319 { 3320 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3321 PetscInt *AJ=a->j,nz=a->nz; 3322 unsigned short *aj=(unsigned short *)AJ; 3323 PetscFunctionBegin; 3324 /* Is this really necessary? */ 3325 while (nz--) { 3326 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3327 } 3328 A->ops->setunfactored = PETSC_NULL; 3329 PetscFunctionReturn(0); 3330 } 3331 3332 3333