1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 59 { 60 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 61 PetscErrorCode ierr; 62 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 63 PetscInt *diag = a->diag,oidx; 64 MatScalar *aa=a->a,*v; 65 PetscScalar s1,s2,x1,x2; 66 PetscScalar *x,*b; 67 68 PetscFunctionBegin; 69 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 70 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 71 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 72 73 /* forward solve the U^T */ 74 idx = 0; 75 for (i=0; i<n; i++) { 76 77 v = aa + 4*diag[i]; 78 /* multiply by the inverse of the block diagonal */ 79 x1 = x[idx]; x2 = x[1+idx]; 80 s1 = v[0]*x1 + v[1]*x2; 81 s2 = v[2]*x1 + v[3]*x2; 82 v += 4; 83 84 vi = aj + diag[i] + 1; 85 nz = ai[i+1] - diag[i] - 1; 86 while (nz--) { 87 oidx = 2*(*vi++); 88 x[oidx] -= v[0]*s1 + v[1]*s2; 89 x[oidx+1] -= v[2]*s1 + v[3]*s2; 90 v += 4; 91 } 92 x[idx] = s1;x[1+idx] = s2; 93 idx += 2; 94 } 95 /* backward solve the L^T */ 96 for (i=n-1; i>=0; i--){ 97 v = aa + 4*diag[i] - 4; 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - ai[i]; 100 idt = 2*i; 101 s1 = x[idt]; s2 = x[1+idt]; 102 while (nz--) { 103 idx = 2*(*vi--); 104 x[idx] -= v[0]*s1 + v[1]*s2; 105 x[idx+1] -= v[2]*s1 + v[3]*s2; 106 v -= 4; 107 } 108 } 109 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 110 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 111 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 112 PetscFunctionReturn(0); 113 } 114 115 #undef __FUNCT__ 116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 118 { 119 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 120 PetscErrorCode ierr; 121 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 122 PetscInt *diag = a->diag,oidx; 123 MatScalar *aa=a->a,*v; 124 PetscScalar s1,s2,s3,x1,x2,x3; 125 PetscScalar *x,*b; 126 127 PetscFunctionBegin; 128 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 129 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 130 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131 132 /* forward solve the U^T */ 133 idx = 0; 134 for (i=0; i<n; i++) { 135 136 v = aa + 9*diag[i]; 137 /* multiply by the inverse of the block diagonal */ 138 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 140 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 141 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 142 v += 9; 143 144 vi = aj + diag[i] + 1; 145 nz = ai[i+1] - diag[i] - 1; 146 while (nz--) { 147 oidx = 3*(*vi++); 148 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 149 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 150 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 151 v += 9; 152 } 153 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 154 idx += 3; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + 9*diag[i] - 9; 159 vi = aj + diag[i] - 1; 160 nz = diag[i] - ai[i]; 161 idt = 3*i; 162 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 163 while (nz--) { 164 idx = 3*(*vi--); 165 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 166 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 167 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 168 v -= 9; 169 } 170 } 171 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 172 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 173 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 174 PetscFunctionReturn(0); 175 } 176 177 #undef __FUNCT__ 178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 180 { 181 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182 PetscErrorCode ierr; 183 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 184 PetscInt *diag = a->diag,oidx; 185 MatScalar *aa=a->a,*v; 186 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 187 PetscScalar *x,*b; 188 189 PetscFunctionBegin; 190 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 191 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 192 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193 194 /* forward solve the U^T */ 195 idx = 0; 196 for (i=0; i<n; i++) { 197 198 v = aa + 16*diag[i]; 199 /* multiply by the inverse of the block diagonal */ 200 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 201 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 202 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 203 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 204 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 205 v += 16; 206 207 vi = aj + diag[i] + 1; 208 nz = ai[i+1] - diag[i] - 1; 209 while (nz--) { 210 oidx = 4*(*vi++); 211 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 212 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 213 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 214 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 215 v += 16; 216 } 217 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 218 idx += 4; 219 } 220 /* backward solve the L^T */ 221 for (i=n-1; i>=0; i--){ 222 v = aa + 16*diag[i] - 16; 223 vi = aj + diag[i] - 1; 224 nz = diag[i] - ai[i]; 225 idt = 4*i; 226 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 227 while (nz--) { 228 idx = 4*(*vi--); 229 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 230 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 231 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 232 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 233 v -= 16; 234 } 235 } 236 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 237 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 238 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 #undef __FUNCT__ 243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 245 { 246 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 247 PetscErrorCode ierr; 248 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 249 PetscInt *diag = a->diag,oidx; 250 MatScalar *aa=a->a,*v; 251 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 252 PetscScalar *x,*b; 253 254 PetscFunctionBegin; 255 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 258 259 /* forward solve the U^T */ 260 idx = 0; 261 for (i=0; i<n; i++) { 262 263 v = aa + 25*diag[i]; 264 /* multiply by the inverse of the block diagonal */ 265 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 266 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 267 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 268 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 269 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 270 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 271 v += 25; 272 273 vi = aj + diag[i] + 1; 274 nz = ai[i+1] - diag[i] - 1; 275 while (nz--) { 276 oidx = 5*(*vi++); 277 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 278 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 279 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 280 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 281 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 282 v += 25; 283 } 284 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 285 idx += 5; 286 } 287 /* backward solve the L^T */ 288 for (i=n-1; i>=0; i--){ 289 v = aa + 25*diag[i] - 25; 290 vi = aj + diag[i] - 1; 291 nz = diag[i] - ai[i]; 292 idt = 5*i; 293 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 294 while (nz--) { 295 idx = 5*(*vi--); 296 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 297 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 298 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 299 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 300 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 301 v -= 25; 302 } 303 } 304 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309 310 #undef __FUNCT__ 311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 313 { 314 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 315 PetscErrorCode ierr; 316 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 317 PetscInt *diag = a->diag,oidx; 318 MatScalar *aa=a->a,*v; 319 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 320 PetscScalar *x,*b; 321 322 PetscFunctionBegin; 323 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 324 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 326 327 /* forward solve the U^T */ 328 idx = 0; 329 for (i=0; i<n; i++) { 330 331 v = aa + 36*diag[i]; 332 /* multiply by the inverse of the block diagonal */ 333 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 334 x6 = x[5+idx]; 335 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 336 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 337 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 338 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 339 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 340 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 341 v += 36; 342 343 vi = aj + diag[i] + 1; 344 nz = ai[i+1] - diag[i] - 1; 345 while (nz--) { 346 oidx = 6*(*vi++); 347 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 348 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 349 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 350 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 351 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 352 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 353 v += 36; 354 } 355 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 356 x[5+idx] = s6; 357 idx += 6; 358 } 359 /* backward solve the L^T */ 360 for (i=n-1; i>=0; i--){ 361 v = aa + 36*diag[i] - 36; 362 vi = aj + diag[i] - 1; 363 nz = diag[i] - ai[i]; 364 idt = 6*i; 365 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 366 s6 = x[5+idt]; 367 while (nz--) { 368 idx = 6*(*vi--); 369 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 370 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 371 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 372 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 373 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 374 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 375 v -= 36; 376 } 377 } 378 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 379 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 380 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 381 PetscFunctionReturn(0); 382 } 383 384 #undef __FUNCT__ 385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 387 { 388 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 389 PetscErrorCode ierr; 390 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 391 PetscInt *diag = a->diag,oidx; 392 MatScalar *aa=a->a,*v; 393 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 394 PetscScalar *x,*b; 395 396 PetscFunctionBegin; 397 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 400 401 /* forward solve the U^T */ 402 idx = 0; 403 for (i=0; i<n; i++) { 404 405 v = aa + 49*diag[i]; 406 /* multiply by the inverse of the block diagonal */ 407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 408 x6 = x[5+idx]; x7 = x[6+idx]; 409 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 410 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 411 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 412 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 413 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 414 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 415 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 416 v += 49; 417 418 vi = aj + diag[i] + 1; 419 nz = ai[i+1] - diag[i] - 1; 420 while (nz--) { 421 oidx = 7*(*vi++); 422 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 423 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 424 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 425 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 426 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 427 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 428 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 429 v += 49; 430 } 431 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 432 x[5+idx] = s6;x[6+idx] = s7; 433 idx += 7; 434 } 435 /* backward solve the L^T */ 436 for (i=n-1; i>=0; i--){ 437 v = aa + 49*diag[i] - 49; 438 vi = aj + diag[i] - 1; 439 nz = diag[i] - ai[i]; 440 idt = 7*i; 441 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 442 s6 = x[5+idt];s7 = x[6+idt]; 443 while (nz--) { 444 idx = 7*(*vi--); 445 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 446 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 447 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 448 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 449 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 450 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 451 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 452 v -= 49; 453 } 454 } 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 /*---------------------------------------------------------------------------------------------*/ 462 #undef __FUNCT__ 463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 467 IS iscol=a->col,isrow=a->row; 468 PetscErrorCode ierr; 469 const PetscInt *r,*c,*rout,*cout; 470 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 471 PetscInt *diag = a->diag; 472 MatScalar *aa=a->a,*v; 473 PetscScalar s1,*x,*b,*t; 474 475 PetscFunctionBegin; 476 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 477 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478 t = a->solve_work; 479 480 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 481 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 482 483 /* copy the b into temp work space according to permutation */ 484 for (i=0; i<n; i++) { 485 t[i] = b[c[i]]; 486 } 487 488 /* forward solve the U^T */ 489 for (i=0; i<n; i++) { 490 491 v = aa + diag[i]; 492 /* multiply by the inverse of the block diagonal */ 493 s1 = (*v++)*t[i]; 494 vi = aj + diag[i] + 1; 495 nz = ai[i+1] - diag[i] - 1; 496 while (nz--) { 497 t[*vi++] -= (*v++)*s1; 498 } 499 t[i] = s1; 500 } 501 /* backward solve the L^T */ 502 for (i=n-1; i>=0; i--){ 503 v = aa + diag[i] - 1; 504 vi = aj + diag[i] - 1; 505 nz = diag[i] - ai[i]; 506 s1 = t[i]; 507 while (nz--) { 508 t[*vi--] -= (*v--)*s1; 509 } 510 } 511 512 /* copy t into x according to permutation */ 513 for (i=0; i<n; i++) { 514 x[r[i]] = t[i]; 515 } 516 517 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 518 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 519 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 520 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 522 PetscFunctionReturn(0); 523 } 524 525 #undef __FUNCT__ 526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 527 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 528 { 529 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 530 IS iscol=a->col,isrow=a->row; 531 PetscErrorCode ierr; 532 const PetscInt *r,*c,*rout,*cout; 533 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 534 PetscInt *diag = a->diag,ii,ic,ir,oidx; 535 MatScalar *aa=a->a,*v; 536 PetscScalar s1,s2,x1,x2; 537 PetscScalar *x,*b,*t; 538 539 PetscFunctionBegin; 540 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 541 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 542 t = a->solve_work; 543 544 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 545 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 546 547 /* copy the b into temp work space according to permutation */ 548 ii = 0; 549 for (i=0; i<n; i++) { 550 ic = 2*c[i]; 551 t[ii] = b[ic]; 552 t[ii+1] = b[ic+1]; 553 ii += 2; 554 } 555 556 /* forward solve the U^T */ 557 idx = 0; 558 for (i=0; i<n; i++) { 559 560 v = aa + 4*diag[i]; 561 /* multiply by the inverse of the block diagonal */ 562 x1 = t[idx]; x2 = t[1+idx]; 563 s1 = v[0]*x1 + v[1]*x2; 564 s2 = v[2]*x1 + v[3]*x2; 565 v += 4; 566 567 vi = aj + diag[i] + 1; 568 nz = ai[i+1] - diag[i] - 1; 569 while (nz--) { 570 oidx = 2*(*vi++); 571 t[oidx] -= v[0]*s1 + v[1]*s2; 572 t[oidx+1] -= v[2]*s1 + v[3]*s2; 573 v += 4; 574 } 575 t[idx] = s1;t[1+idx] = s2; 576 idx += 2; 577 } 578 /* backward solve the L^T */ 579 for (i=n-1; i>=0; i--){ 580 v = aa + 4*diag[i] - 4; 581 vi = aj + diag[i] - 1; 582 nz = diag[i] - ai[i]; 583 idt = 2*i; 584 s1 = t[idt]; s2 = t[1+idt]; 585 while (nz--) { 586 idx = 2*(*vi--); 587 t[idx] -= v[0]*s1 + v[1]*s2; 588 t[idx+1] -= v[2]*s1 + v[3]*s2; 589 v -= 4; 590 } 591 } 592 593 /* copy t into x according to permutation */ 594 ii = 0; 595 for (i=0; i<n; i++) { 596 ir = 2*r[i]; 597 x[ir] = t[ii]; 598 x[ir+1] = t[ii+1]; 599 ii += 2; 600 } 601 602 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 603 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 604 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 605 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 606 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 607 PetscFunctionReturn(0); 608 } 609 610 #undef __FUNCT__ 611 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 612 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 613 { 614 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 615 IS iscol=a->col,isrow=a->row; 616 PetscErrorCode ierr; 617 const PetscInt *r,*c,*rout,*cout; 618 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 619 PetscInt *diag = a->diag,ii,ic,ir,oidx; 620 MatScalar *aa=a->a,*v; 621 PetscScalar s1,s2,s3,x1,x2,x3; 622 PetscScalar *x,*b,*t; 623 624 PetscFunctionBegin; 625 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 626 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 627 t = a->solve_work; 628 629 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 630 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 631 632 /* copy the b into temp work space according to permutation */ 633 ii = 0; 634 for (i=0; i<n; i++) { 635 ic = 3*c[i]; 636 t[ii] = b[ic]; 637 t[ii+1] = b[ic+1]; 638 t[ii+2] = b[ic+2]; 639 ii += 3; 640 } 641 642 /* forward solve the U^T */ 643 idx = 0; 644 for (i=0; i<n; i++) { 645 646 v = aa + 9*diag[i]; 647 /* multiply by the inverse of the block diagonal */ 648 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 649 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 650 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 651 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 652 v += 9; 653 654 vi = aj + diag[i] + 1; 655 nz = ai[i+1] - diag[i] - 1; 656 while (nz--) { 657 oidx = 3*(*vi++); 658 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 659 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 660 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 661 v += 9; 662 } 663 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 664 idx += 3; 665 } 666 /* backward solve the L^T */ 667 for (i=n-1; i>=0; i--){ 668 v = aa + 9*diag[i] - 9; 669 vi = aj + diag[i] - 1; 670 nz = diag[i] - ai[i]; 671 idt = 3*i; 672 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 673 while (nz--) { 674 idx = 3*(*vi--); 675 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 676 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 677 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 678 v -= 9; 679 } 680 } 681 682 /* copy t into x according to permutation */ 683 ii = 0; 684 for (i=0; i<n; i++) { 685 ir = 3*r[i]; 686 x[ir] = t[ii]; 687 x[ir+1] = t[ii+1]; 688 x[ir+2] = t[ii+2]; 689 ii += 3; 690 } 691 692 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 693 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 694 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 695 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 696 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 697 PetscFunctionReturn(0); 698 } 699 700 #undef __FUNCT__ 701 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 702 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 703 { 704 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 705 IS iscol=a->col,isrow=a->row; 706 PetscErrorCode ierr; 707 const PetscInt *r,*c,*rout,*cout; 708 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 709 PetscInt *diag = a->diag,ii,ic,ir,oidx; 710 MatScalar *aa=a->a,*v; 711 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 712 PetscScalar *x,*b,*t; 713 714 PetscFunctionBegin; 715 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 717 t = a->solve_work; 718 719 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 720 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 721 722 /* copy the b into temp work space according to permutation */ 723 ii = 0; 724 for (i=0; i<n; i++) { 725 ic = 4*c[i]; 726 t[ii] = b[ic]; 727 t[ii+1] = b[ic+1]; 728 t[ii+2] = b[ic+2]; 729 t[ii+3] = b[ic+3]; 730 ii += 4; 731 } 732 733 /* forward solve the U^T */ 734 idx = 0; 735 for (i=0; i<n; i++) { 736 737 v = aa + 16*diag[i]; 738 /* multiply by the inverse of the block diagonal */ 739 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 740 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 741 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 742 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 743 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 744 v += 16; 745 746 vi = aj + diag[i] + 1; 747 nz = ai[i+1] - diag[i] - 1; 748 while (nz--) { 749 oidx = 4*(*vi++); 750 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 751 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 752 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 753 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 754 v += 16; 755 } 756 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 757 idx += 4; 758 } 759 /* backward solve the L^T */ 760 for (i=n-1; i>=0; i--){ 761 v = aa + 16*diag[i] - 16; 762 vi = aj + diag[i] - 1; 763 nz = diag[i] - ai[i]; 764 idt = 4*i; 765 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 766 while (nz--) { 767 idx = 4*(*vi--); 768 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 769 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 770 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 771 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 772 v -= 16; 773 } 774 } 775 776 /* copy t into x according to permutation */ 777 ii = 0; 778 for (i=0; i<n; i++) { 779 ir = 4*r[i]; 780 x[ir] = t[ii]; 781 x[ir+1] = t[ii+1]; 782 x[ir+2] = t[ii+2]; 783 x[ir+3] = t[ii+3]; 784 ii += 4; 785 } 786 787 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 788 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 789 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 790 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 791 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 792 PetscFunctionReturn(0); 793 } 794 795 #undef __FUNCT__ 796 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 797 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 798 { 799 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 800 IS iscol=a->col,isrow=a->row; 801 PetscErrorCode ierr; 802 const PetscInt *r,*c,*rout,*cout; 803 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 804 PetscInt *diag = a->diag,ii,ic,ir,oidx; 805 MatScalar *aa=a->a,*v; 806 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 807 PetscScalar *x,*b,*t; 808 809 PetscFunctionBegin; 810 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 811 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 812 t = a->solve_work; 813 814 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 815 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 816 817 /* copy the b into temp work space according to permutation */ 818 ii = 0; 819 for (i=0; i<n; i++) { 820 ic = 5*c[i]; 821 t[ii] = b[ic]; 822 t[ii+1] = b[ic+1]; 823 t[ii+2] = b[ic+2]; 824 t[ii+3] = b[ic+3]; 825 t[ii+4] = b[ic+4]; 826 ii += 5; 827 } 828 829 /* forward solve the U^T */ 830 idx = 0; 831 for (i=0; i<n; i++) { 832 833 v = aa + 25*diag[i]; 834 /* multiply by the inverse of the block diagonal */ 835 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 836 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 837 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 838 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 839 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 840 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 841 v += 25; 842 843 vi = aj + diag[i] + 1; 844 nz = ai[i+1] - diag[i] - 1; 845 while (nz--) { 846 oidx = 5*(*vi++); 847 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 848 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 849 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 850 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 851 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 852 v += 25; 853 } 854 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 855 idx += 5; 856 } 857 /* backward solve the L^T */ 858 for (i=n-1; i>=0; i--){ 859 v = aa + 25*diag[i] - 25; 860 vi = aj + diag[i] - 1; 861 nz = diag[i] - ai[i]; 862 idt = 5*i; 863 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 864 while (nz--) { 865 idx = 5*(*vi--); 866 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 867 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 868 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 869 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 870 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 871 v -= 25; 872 } 873 } 874 875 /* copy t into x according to permutation */ 876 ii = 0; 877 for (i=0; i<n; i++) { 878 ir = 5*r[i]; 879 x[ir] = t[ii]; 880 x[ir+1] = t[ii+1]; 881 x[ir+2] = t[ii+2]; 882 x[ir+3] = t[ii+3]; 883 x[ir+4] = t[ii+4]; 884 ii += 5; 885 } 886 887 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 888 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 889 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 890 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 891 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 892 PetscFunctionReturn(0); 893 } 894 895 #undef __FUNCT__ 896 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 897 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 898 { 899 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 900 IS iscol=a->col,isrow=a->row; 901 PetscErrorCode ierr; 902 const PetscInt *r,*c,*rout,*cout; 903 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 904 PetscInt *diag = a->diag,ii,ic,ir,oidx; 905 MatScalar *aa=a->a,*v; 906 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 907 PetscScalar *x,*b,*t; 908 909 PetscFunctionBegin; 910 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 911 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 912 t = a->solve_work; 913 914 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 915 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 916 917 /* copy the b into temp work space according to permutation */ 918 ii = 0; 919 for (i=0; i<n; i++) { 920 ic = 6*c[i]; 921 t[ii] = b[ic]; 922 t[ii+1] = b[ic+1]; 923 t[ii+2] = b[ic+2]; 924 t[ii+3] = b[ic+3]; 925 t[ii+4] = b[ic+4]; 926 t[ii+5] = b[ic+5]; 927 ii += 6; 928 } 929 930 /* forward solve the U^T */ 931 idx = 0; 932 for (i=0; i<n; i++) { 933 934 v = aa + 36*diag[i]; 935 /* multiply by the inverse of the block diagonal */ 936 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 937 x6 = t[5+idx]; 938 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 939 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 940 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 941 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 942 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 943 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 944 v += 36; 945 946 vi = aj + diag[i] + 1; 947 nz = ai[i+1] - diag[i] - 1; 948 while (nz--) { 949 oidx = 6*(*vi++); 950 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 951 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 952 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 953 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 954 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 955 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 956 v += 36; 957 } 958 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 959 t[5+idx] = s6; 960 idx += 6; 961 } 962 /* backward solve the L^T */ 963 for (i=n-1; i>=0; i--){ 964 v = aa + 36*diag[i] - 36; 965 vi = aj + diag[i] - 1; 966 nz = diag[i] - ai[i]; 967 idt = 6*i; 968 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 969 s6 = t[5+idt]; 970 while (nz--) { 971 idx = 6*(*vi--); 972 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 973 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 974 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 975 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 976 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 977 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 978 v -= 36; 979 } 980 } 981 982 /* copy t into x according to permutation */ 983 ii = 0; 984 for (i=0; i<n; i++) { 985 ir = 6*r[i]; 986 x[ir] = t[ii]; 987 x[ir+1] = t[ii+1]; 988 x[ir+2] = t[ii+2]; 989 x[ir+3] = t[ii+3]; 990 x[ir+4] = t[ii+4]; 991 x[ir+5] = t[ii+5]; 992 ii += 6; 993 } 994 995 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 996 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 997 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 998 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 999 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1000 PetscFunctionReturn(0); 1001 } 1002 1003 #undef __FUNCT__ 1004 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1005 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1006 { 1007 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1008 IS iscol=a->col,isrow=a->row; 1009 PetscErrorCode ierr; 1010 const PetscInt *r,*c,*rout,*cout; 1011 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1012 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1013 MatScalar *aa=a->a,*v; 1014 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1015 PetscScalar *x,*b,*t; 1016 1017 PetscFunctionBegin; 1018 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1019 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1020 t = a->solve_work; 1021 1022 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1023 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1024 1025 /* copy the b into temp work space according to permutation */ 1026 ii = 0; 1027 for (i=0; i<n; i++) { 1028 ic = 7*c[i]; 1029 t[ii] = b[ic]; 1030 t[ii+1] = b[ic+1]; 1031 t[ii+2] = b[ic+2]; 1032 t[ii+3] = b[ic+3]; 1033 t[ii+4] = b[ic+4]; 1034 t[ii+5] = b[ic+5]; 1035 t[ii+6] = b[ic+6]; 1036 ii += 7; 1037 } 1038 1039 /* forward solve the U^T */ 1040 idx = 0; 1041 for (i=0; i<n; i++) { 1042 1043 v = aa + 49*diag[i]; 1044 /* multiply by the inverse of the block diagonal */ 1045 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1046 x6 = t[5+idx]; x7 = t[6+idx]; 1047 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1048 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1049 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1050 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1051 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1052 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1053 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1054 v += 49; 1055 1056 vi = aj + diag[i] + 1; 1057 nz = ai[i+1] - diag[i] - 1; 1058 while (nz--) { 1059 oidx = 7*(*vi++); 1060 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1061 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1062 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1063 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1064 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1065 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1066 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1067 v += 49; 1068 } 1069 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1070 t[5+idx] = s6;t[6+idx] = s7; 1071 idx += 7; 1072 } 1073 /* backward solve the L^T */ 1074 for (i=n-1; i>=0; i--){ 1075 v = aa + 49*diag[i] - 49; 1076 vi = aj + diag[i] - 1; 1077 nz = diag[i] - ai[i]; 1078 idt = 7*i; 1079 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1080 s6 = t[5+idt];s7 = t[6+idt]; 1081 while (nz--) { 1082 idx = 7*(*vi--); 1083 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1084 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1085 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1086 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1087 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1088 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1089 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1090 v -= 49; 1091 } 1092 } 1093 1094 /* copy t into x according to permutation */ 1095 ii = 0; 1096 for (i=0; i<n; i++) { 1097 ir = 7*r[i]; 1098 x[ir] = t[ii]; 1099 x[ir+1] = t[ii+1]; 1100 x[ir+2] = t[ii+2]; 1101 x[ir+3] = t[ii+3]; 1102 x[ir+4] = t[ii+4]; 1103 x[ir+5] = t[ii+5]; 1104 x[ir+6] = t[ii+6]; 1105 ii += 7; 1106 } 1107 1108 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1109 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1110 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1111 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1112 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1113 PetscFunctionReturn(0); 1114 } 1115 1116 /* ----------------------------------------------------------- */ 1117 #undef __FUNCT__ 1118 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1119 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1120 { 1121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1122 IS iscol=a->col,isrow=a->row; 1123 PetscErrorCode ierr; 1124 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1125 PetscInt i,n=a->mbs; 1126 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1127 MatScalar *aa=a->a,*v; 1128 PetscScalar *x,*b,*s,*t,*ls; 1129 1130 PetscFunctionBegin; 1131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133 t = a->solve_work; 1134 1135 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1136 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1137 1138 /* forward solve the lower triangular */ 1139 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1140 for (i=1; i<n; i++) { 1141 v = aa + bs2*ai[i]; 1142 vi = aj + ai[i]; 1143 nz = a->diag[i] - ai[i]; 1144 s = t + bs*i; 1145 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1146 while (nz--) { 1147 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1148 v += bs2; 1149 } 1150 } 1151 /* backward solve the upper triangular */ 1152 ls = a->solve_work + A->cmap->n; 1153 for (i=n-1; i>=0; i--){ 1154 v = aa + bs2*(a->diag[i] + 1); 1155 vi = aj + a->diag[i] + 1; 1156 nz = ai[i+1] - a->diag[i] - 1; 1157 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1158 while (nz--) { 1159 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1160 v += bs2; 1161 } 1162 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1163 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1164 } 1165 1166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1176 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1177 { 1178 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1179 IS iscol=a->col,isrow=a->row; 1180 PetscErrorCode ierr; 1181 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1182 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1183 MatScalar *aa=a->a,*v; 1184 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1185 PetscScalar *x,*b,*t; 1186 1187 PetscFunctionBegin; 1188 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1189 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1190 t = a->solve_work; 1191 1192 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1193 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1194 1195 /* forward solve the lower triangular */ 1196 idx = 7*(*r++); 1197 t[0] = b[idx]; t[1] = b[1+idx]; 1198 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1199 t[5] = b[5+idx]; t[6] = b[6+idx]; 1200 1201 for (i=1; i<n; i++) { 1202 v = aa + 49*ai[i]; 1203 vi = aj + ai[i]; 1204 nz = diag[i] - ai[i]; 1205 idx = 7*(*r++); 1206 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1207 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1208 while (nz--) { 1209 idx = 7*(*vi++); 1210 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1211 x4 = t[3+idx];x5 = t[4+idx]; 1212 x6 = t[5+idx];x7 = t[6+idx]; 1213 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1214 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1215 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1216 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1217 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1218 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1219 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1220 v += 49; 1221 } 1222 idx = 7*i; 1223 t[idx] = s1;t[1+idx] = s2; 1224 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1225 t[5+idx] = s6;t[6+idx] = s7; 1226 } 1227 /* backward solve the upper triangular */ 1228 for (i=n-1; i>=0; i--){ 1229 v = aa + 49*diag[i] + 49; 1230 vi = aj + diag[i] + 1; 1231 nz = ai[i+1] - diag[i] - 1; 1232 idt = 7*i; 1233 s1 = t[idt]; s2 = t[1+idt]; 1234 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1235 s6 = t[5+idt];s7 = t[6+idt]; 1236 while (nz--) { 1237 idx = 7*(*vi++); 1238 x1 = t[idx]; x2 = t[1+idx]; 1239 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1240 x6 = t[5+idx]; x7 = t[6+idx]; 1241 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1242 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1243 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1244 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1245 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1246 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1247 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1248 v += 49; 1249 } 1250 idc = 7*(*c--); 1251 v = aa + 49*diag[i]; 1252 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1253 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1254 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1255 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1256 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1257 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1258 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1259 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1260 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1261 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1262 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1263 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1264 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1265 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1266 } 1267 1268 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1269 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1270 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1272 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1273 PetscFunctionReturn(0); 1274 } 1275 1276 #undef __FUNCT__ 1277 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1278 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1279 { 1280 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1281 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1282 PetscErrorCode ierr; 1283 PetscInt *diag = a->diag,jdx; 1284 const MatScalar *aa=a->a,*v; 1285 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1286 const PetscScalar *b; 1287 1288 PetscFunctionBegin; 1289 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1290 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1291 /* forward solve the lower triangular */ 1292 idx = 0; 1293 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1294 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1295 x[6] = b[6+idx]; 1296 for (i=1; i<n; i++) { 1297 v = aa + 49*ai[i]; 1298 vi = aj + ai[i]; 1299 nz = diag[i] - ai[i]; 1300 idx = 7*i; 1301 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1302 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1303 s7 = b[6+idx]; 1304 while (nz--) { 1305 jdx = 7*(*vi++); 1306 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1307 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1308 x7 = x[6+jdx]; 1309 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1310 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1311 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1312 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1313 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1314 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1315 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1316 v += 49; 1317 } 1318 x[idx] = s1; 1319 x[1+idx] = s2; 1320 x[2+idx] = s3; 1321 x[3+idx] = s4; 1322 x[4+idx] = s5; 1323 x[5+idx] = s6; 1324 x[6+idx] = s7; 1325 } 1326 /* backward solve the upper triangular */ 1327 for (i=n-1; i>=0; i--){ 1328 v = aa + 49*diag[i] + 49; 1329 vi = aj + diag[i] + 1; 1330 nz = ai[i+1] - diag[i] - 1; 1331 idt = 7*i; 1332 s1 = x[idt]; s2 = x[1+idt]; 1333 s3 = x[2+idt]; s4 = x[3+idt]; 1334 s5 = x[4+idt]; s6 = x[5+idt]; 1335 s7 = x[6+idt]; 1336 while (nz--) { 1337 idx = 7*(*vi++); 1338 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1339 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1340 x7 = x[6+idx]; 1341 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1342 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1343 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1344 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1345 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1346 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1347 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1348 v += 49; 1349 } 1350 v = aa + 49*diag[i]; 1351 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1352 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1353 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1354 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1355 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1356 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1357 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1358 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1359 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1360 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1361 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1362 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1363 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1364 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1365 } 1366 1367 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1369 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1370 PetscFunctionReturn(0); 1371 } 1372 1373 #undef __FUNCT__ 1374 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1375 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1376 { 1377 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1378 IS iscol=a->col,isrow=a->row; 1379 PetscErrorCode ierr; 1380 const PetscInt *r,*c,*rout,*cout; 1381 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1382 const MatScalar *aa=a->a,*v; 1383 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1384 const PetscScalar *b; 1385 PetscFunctionBegin; 1386 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1387 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1388 t = a->solve_work; 1389 1390 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1391 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1392 1393 /* forward solve the lower triangular */ 1394 idx = 6*(*r++); 1395 t[0] = b[idx]; t[1] = b[1+idx]; 1396 t[2] = b[2+idx]; t[3] = b[3+idx]; 1397 t[4] = b[4+idx]; t[5] = b[5+idx]; 1398 for (i=1; i<n; i++) { 1399 v = aa + 36*ai[i]; 1400 vi = aj + ai[i]; 1401 nz = diag[i] - ai[i]; 1402 idx = 6*(*r++); 1403 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1404 s5 = b[4+idx]; s6 = b[5+idx]; 1405 while (nz--) { 1406 idx = 6*(*vi++); 1407 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1408 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1409 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1415 v += 36; 1416 } 1417 idx = 6*i; 1418 t[idx] = s1;t[1+idx] = s2; 1419 t[2+idx] = s3;t[3+idx] = s4; 1420 t[4+idx] = s5;t[5+idx] = s6; 1421 } 1422 /* backward solve the upper triangular */ 1423 for (i=n-1; i>=0; i--){ 1424 v = aa + 36*diag[i] + 36; 1425 vi = aj + diag[i] + 1; 1426 nz = ai[i+1] - diag[i] - 1; 1427 idt = 6*i; 1428 s1 = t[idt]; s2 = t[1+idt]; 1429 s3 = t[2+idt];s4 = t[3+idt]; 1430 s5 = t[4+idt];s6 = t[5+idt]; 1431 while (nz--) { 1432 idx = 6*(*vi++); 1433 x1 = t[idx]; x2 = t[1+idx]; 1434 x3 = t[2+idx]; x4 = t[3+idx]; 1435 x5 = t[4+idx]; x6 = t[5+idx]; 1436 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1437 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1438 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1439 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1440 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1441 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1442 v += 36; 1443 } 1444 idc = 6*(*c--); 1445 v = aa + 36*diag[i]; 1446 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1447 v[18]*s4+v[24]*s5+v[30]*s6; 1448 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1449 v[19]*s4+v[25]*s5+v[31]*s6; 1450 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1451 v[20]*s4+v[26]*s5+v[32]*s6; 1452 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1453 v[21]*s4+v[27]*s5+v[33]*s6; 1454 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1455 v[22]*s4+v[28]*s5+v[34]*s6; 1456 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1457 v[23]*s4+v[29]*s5+v[35]*s6; 1458 } 1459 1460 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1461 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1462 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1463 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1464 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1465 PetscFunctionReturn(0); 1466 } 1467 1468 #undef __FUNCT__ 1469 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1470 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1471 { 1472 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1473 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1474 PetscErrorCode ierr; 1475 PetscInt *diag = a->diag,jdx; 1476 const MatScalar *aa=a->a,*v; 1477 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1478 const PetscScalar *b; 1479 1480 PetscFunctionBegin; 1481 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1482 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1483 /* forward solve the lower triangular */ 1484 idx = 0; 1485 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1486 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1487 for (i=1; i<n; i++) { 1488 v = aa + 36*ai[i]; 1489 vi = aj + ai[i]; 1490 nz = diag[i] - ai[i]; 1491 idx = 6*i; 1492 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1493 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1494 while (nz--) { 1495 jdx = 6*(*vi++); 1496 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1497 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1498 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1499 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1500 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1501 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1502 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1503 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1504 v += 36; 1505 } 1506 x[idx] = s1; 1507 x[1+idx] = s2; 1508 x[2+idx] = s3; 1509 x[3+idx] = s4; 1510 x[4+idx] = s5; 1511 x[5+idx] = s6; 1512 } 1513 /* backward solve the upper triangular */ 1514 for (i=n-1; i>=0; i--){ 1515 v = aa + 36*diag[i] + 36; 1516 vi = aj + diag[i] + 1; 1517 nz = ai[i+1] - diag[i] - 1; 1518 idt = 6*i; 1519 s1 = x[idt]; s2 = x[1+idt]; 1520 s3 = x[2+idt]; s4 = x[3+idt]; 1521 s5 = x[4+idt]; s6 = x[5+idt]; 1522 while (nz--) { 1523 idx = 6*(*vi++); 1524 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1525 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1526 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1527 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1528 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1529 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1530 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1531 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1532 v += 36; 1533 } 1534 v = aa + 36*diag[i]; 1535 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1536 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1537 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1538 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1539 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1540 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1541 } 1542 1543 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1544 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1545 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1546 PetscFunctionReturn(0); 1547 } 1548 1549 #undef __FUNCT__ 1550 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1551 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1552 { 1553 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1554 IS iscol=a->col,isrow=a->row; 1555 PetscErrorCode ierr; 1556 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1557 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1558 const MatScalar *aa=a->a,*v; 1559 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1560 const PetscScalar *b; 1561 1562 PetscFunctionBegin; 1563 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1564 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1565 t = a->solve_work; 1566 1567 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1568 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1569 1570 /* forward solve the lower triangular */ 1571 idx = 5*(*r++); 1572 t[0] = b[idx]; t[1] = b[1+idx]; 1573 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1574 for (i=1; i<n; i++) { 1575 v = aa + 25*ai[i]; 1576 vi = aj + ai[i]; 1577 nz = diag[i] - ai[i]; 1578 idx = 5*(*r++); 1579 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1580 s5 = b[4+idx]; 1581 while (nz--) { 1582 idx = 5*(*vi++); 1583 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1584 x4 = t[3+idx];x5 = t[4+idx]; 1585 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1586 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1587 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1588 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1589 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1590 v += 25; 1591 } 1592 idx = 5*i; 1593 t[idx] = s1;t[1+idx] = s2; 1594 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1595 } 1596 /* backward solve the upper triangular */ 1597 for (i=n-1; i>=0; i--){ 1598 v = aa + 25*diag[i] + 25; 1599 vi = aj + diag[i] + 1; 1600 nz = ai[i+1] - diag[i] - 1; 1601 idt = 5*i; 1602 s1 = t[idt]; s2 = t[1+idt]; 1603 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1604 while (nz--) { 1605 idx = 5*(*vi++); 1606 x1 = t[idx]; x2 = t[1+idx]; 1607 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1608 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1609 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1610 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1611 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1612 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1613 v += 25; 1614 } 1615 idc = 5*(*c--); 1616 v = aa + 25*diag[i]; 1617 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1618 v[15]*s4+v[20]*s5; 1619 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1620 v[16]*s4+v[21]*s5; 1621 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1622 v[17]*s4+v[22]*s5; 1623 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1624 v[18]*s4+v[23]*s5; 1625 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1626 v[19]*s4+v[24]*s5; 1627 } 1628 1629 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1630 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1631 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1632 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1633 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1634 PetscFunctionReturn(0); 1635 } 1636 1637 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1638 { 1639 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1640 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1641 PetscErrorCode ierr; 1642 PetscInt jdx; 1643 const MatScalar *aa=a->a,*v; 1644 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1645 const PetscScalar *b; 1646 1647 PetscFunctionBegin; 1648 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1649 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1650 /* forward solve the lower triangular */ 1651 idx = 0; 1652 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1653 for (i=1; i<n; i++) { 1654 v = aa + 25*ai[i]; 1655 vi = aj + ai[i]; 1656 nz = ai[i+1] - ai[i]; 1657 idx = 5*i; 1658 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1659 while (nz--) { 1660 jdx = 5*(*vi++); 1661 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1662 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1663 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1664 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1665 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1666 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1667 v += 25; 1668 } 1669 x[idx] = s1; 1670 x[1+idx] = s2; 1671 x[2+idx] = s3; 1672 x[3+idx] = s4; 1673 x[4+idx] = s5; 1674 } 1675 1676 /* backward solve the upper triangular */ 1677 for (i=n-1; i>=0; i--){ 1678 v = aa + 25*ai[2*n-i]; 1679 vi = aj + ai[2*n-i]; 1680 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1681 idt = 5*i; 1682 s1 = x[idt]; s2 = x[1+idt]; 1683 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1684 while (nz--) { 1685 idx = 5*(*vi++); 1686 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1687 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1688 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1689 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1690 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1691 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1692 v += 25; 1693 } 1694 /* x = inv_diagonal*x */ 1695 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1696 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1697 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1698 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1699 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1700 } 1701 1702 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1703 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1704 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1705 PetscFunctionReturn(0); 1706 } 1707 1708 #undef __FUNCT__ 1709 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1710 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1711 { 1712 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1713 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1714 PetscErrorCode ierr; 1715 PetscInt *diag = a->diag,jdx; 1716 const MatScalar *aa=a->a,*v; 1717 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1718 const PetscScalar *b; 1719 1720 PetscFunctionBegin; 1721 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1722 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1723 /* forward solve the lower triangular */ 1724 idx = 0; 1725 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1726 for (i=1; i<n; i++) { 1727 v = aa + 25*ai[i]; 1728 vi = aj + ai[i]; 1729 nz = diag[i] - ai[i]; 1730 idx = 5*i; 1731 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1732 while (nz--) { 1733 jdx = 5*(*vi++); 1734 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1735 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1736 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1737 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1738 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1739 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1740 v += 25; 1741 } 1742 x[idx] = s1; 1743 x[1+idx] = s2; 1744 x[2+idx] = s3; 1745 x[3+idx] = s4; 1746 x[4+idx] = s5; 1747 } 1748 /* backward solve the upper triangular */ 1749 for (i=n-1; i>=0; i--){ 1750 v = aa + 25*diag[i] + 25; 1751 vi = aj + diag[i] + 1; 1752 nz = ai[i+1] - diag[i] - 1; 1753 idt = 5*i; 1754 s1 = x[idt]; s2 = x[1+idt]; 1755 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1756 while (nz--) { 1757 idx = 5*(*vi++); 1758 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1759 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1760 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1761 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1762 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1763 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1764 v += 25; 1765 } 1766 v = aa + 25*diag[i]; 1767 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1768 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1769 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1770 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1771 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1772 } 1773 1774 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1775 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1776 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1777 PetscFunctionReturn(0); 1778 } 1779 1780 #undef __FUNCT__ 1781 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1782 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1783 { 1784 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1785 IS iscol=a->col,isrow=a->row; 1786 PetscErrorCode ierr; 1787 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1788 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1789 const MatScalar *aa=a->a,*v; 1790 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1791 const PetscScalar *b; 1792 1793 PetscFunctionBegin; 1794 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1796 t = a->solve_work; 1797 1798 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1799 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1800 1801 /* forward solve the lower triangular */ 1802 idx = 4*(*r++); 1803 t[0] = b[idx]; t[1] = b[1+idx]; 1804 t[2] = b[2+idx]; t[3] = b[3+idx]; 1805 for (i=1; i<n; i++) { 1806 v = aa + 16*ai[i]; 1807 vi = aj + ai[i]; 1808 nz = diag[i] - ai[i]; 1809 idx = 4*(*r++); 1810 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1811 while (nz--) { 1812 idx = 4*(*vi++); 1813 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1814 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1815 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1816 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1817 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1818 v += 16; 1819 } 1820 idx = 4*i; 1821 t[idx] = s1;t[1+idx] = s2; 1822 t[2+idx] = s3;t[3+idx] = s4; 1823 } 1824 /* backward solve the upper triangular */ 1825 for (i=n-1; i>=0; i--){ 1826 v = aa + 16*diag[i] + 16; 1827 vi = aj + diag[i] + 1; 1828 nz = ai[i+1] - diag[i] - 1; 1829 idt = 4*i; 1830 s1 = t[idt]; s2 = t[1+idt]; 1831 s3 = t[2+idt];s4 = t[3+idt]; 1832 while (nz--) { 1833 idx = 4*(*vi++); 1834 x1 = t[idx]; x2 = t[1+idx]; 1835 x3 = t[2+idx]; x4 = t[3+idx]; 1836 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1837 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1838 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1839 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1840 v += 16; 1841 } 1842 idc = 4*(*c--); 1843 v = aa + 16*diag[i]; 1844 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1845 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1846 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1847 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1848 } 1849 1850 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1851 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1852 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1853 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1854 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1855 PetscFunctionReturn(0); 1856 } 1857 1858 #undef __FUNCT__ 1859 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1860 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1861 { 1862 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1863 IS iscol=a->col,isrow=a->row; 1864 PetscErrorCode ierr; 1865 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1866 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1867 const MatScalar *aa=a->a,*v; 1868 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1869 PetscScalar *x; 1870 const PetscScalar *b; 1871 1872 PetscFunctionBegin; 1873 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1874 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1875 t = (MatScalar *)a->solve_work; 1876 1877 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1878 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1879 1880 /* forward solve the lower triangular */ 1881 idx = 4*(*r++); 1882 t[0] = (MatScalar)b[idx]; 1883 t[1] = (MatScalar)b[1+idx]; 1884 t[2] = (MatScalar)b[2+idx]; 1885 t[3] = (MatScalar)b[3+idx]; 1886 for (i=1; i<n; i++) { 1887 v = aa + 16*ai[i]; 1888 vi = aj + ai[i]; 1889 nz = diag[i] - ai[i]; 1890 idx = 4*(*r++); 1891 s1 = (MatScalar)b[idx]; 1892 s2 = (MatScalar)b[1+idx]; 1893 s3 = (MatScalar)b[2+idx]; 1894 s4 = (MatScalar)b[3+idx]; 1895 while (nz--) { 1896 idx = 4*(*vi++); 1897 x1 = t[idx]; 1898 x2 = t[1+idx]; 1899 x3 = t[2+idx]; 1900 x4 = t[3+idx]; 1901 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1902 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1903 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1904 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1905 v += 16; 1906 } 1907 idx = 4*i; 1908 t[idx] = s1; 1909 t[1+idx] = s2; 1910 t[2+idx] = s3; 1911 t[3+idx] = s4; 1912 } 1913 /* backward solve the upper triangular */ 1914 for (i=n-1; i>=0; i--){ 1915 v = aa + 16*diag[i] + 16; 1916 vi = aj + diag[i] + 1; 1917 nz = ai[i+1] - diag[i] - 1; 1918 idt = 4*i; 1919 s1 = t[idt]; 1920 s2 = t[1+idt]; 1921 s3 = t[2+idt]; 1922 s4 = t[3+idt]; 1923 while (nz--) { 1924 idx = 4*(*vi++); 1925 x1 = t[idx]; 1926 x2 = t[1+idx]; 1927 x3 = t[2+idx]; 1928 x4 = t[3+idx]; 1929 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1930 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1931 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1932 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1933 v += 16; 1934 } 1935 idc = 4*(*c--); 1936 v = aa + 16*diag[i]; 1937 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1938 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1939 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1940 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1941 x[idc] = (PetscScalar)t[idt]; 1942 x[1+idc] = (PetscScalar)t[1+idt]; 1943 x[2+idc] = (PetscScalar)t[2+idt]; 1944 x[3+idc] = (PetscScalar)t[3+idt]; 1945 } 1946 1947 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1948 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1949 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1950 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1951 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1952 PetscFunctionReturn(0); 1953 } 1954 1955 #if defined (PETSC_HAVE_SSE) 1956 1957 #include PETSC_HAVE_SSE 1958 1959 #undef __FUNCT__ 1960 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1961 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1962 { 1963 /* 1964 Note: This code uses demotion of double 1965 to float when performing the mixed-mode computation. 1966 This may not be numerically reasonable for all applications. 1967 */ 1968 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1969 IS iscol=a->col,isrow=a->row; 1970 PetscErrorCode ierr; 1971 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 1972 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1973 MatScalar *aa=a->a,*v; 1974 PetscScalar *x,*b,*t; 1975 1976 /* Make space in temp stack for 16 Byte Aligned arrays */ 1977 float ssealignedspace[11],*tmps,*tmpx; 1978 unsigned long offset; 1979 1980 PetscFunctionBegin; 1981 SSE_SCOPE_BEGIN; 1982 1983 offset = (unsigned long)ssealignedspace % 16; 1984 if (offset) offset = (16 - offset)/4; 1985 tmps = &ssealignedspace[offset]; 1986 tmpx = &ssealignedspace[offset+4]; 1987 PREFETCH_NTA(aa+16*ai[1]); 1988 1989 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1990 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1991 t = a->solve_work; 1992 1993 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1994 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1995 1996 /* forward solve the lower triangular */ 1997 idx = 4*(*r++); 1998 t[0] = b[idx]; t[1] = b[1+idx]; 1999 t[2] = b[2+idx]; t[3] = b[3+idx]; 2000 v = aa + 16*ai[1]; 2001 2002 for (i=1; i<n;) { 2003 PREFETCH_NTA(&v[8]); 2004 vi = aj + ai[i]; 2005 nz = diag[i] - ai[i]; 2006 idx = 4*(*r++); 2007 2008 /* Demote sum from double to float */ 2009 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2010 LOAD_PS(tmps,XMM7); 2011 2012 while (nz--) { 2013 PREFETCH_NTA(&v[16]); 2014 idx = 4*(*vi++); 2015 2016 /* Demote solution (so far) from double to float */ 2017 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2018 2019 /* 4x4 Matrix-Vector product with negative accumulation: */ 2020 SSE_INLINE_BEGIN_2(tmpx,v) 2021 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2022 2023 /* First Column */ 2024 SSE_COPY_PS(XMM0,XMM6) 2025 SSE_SHUFFLE(XMM0,XMM0,0x00) 2026 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2027 SSE_SUB_PS(XMM7,XMM0) 2028 2029 /* Second Column */ 2030 SSE_COPY_PS(XMM1,XMM6) 2031 SSE_SHUFFLE(XMM1,XMM1,0x55) 2032 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2033 SSE_SUB_PS(XMM7,XMM1) 2034 2035 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2036 2037 /* Third Column */ 2038 SSE_COPY_PS(XMM2,XMM6) 2039 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2040 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2041 SSE_SUB_PS(XMM7,XMM2) 2042 2043 /* Fourth Column */ 2044 SSE_COPY_PS(XMM3,XMM6) 2045 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2046 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2047 SSE_SUB_PS(XMM7,XMM3) 2048 SSE_INLINE_END_2 2049 2050 v += 16; 2051 } 2052 idx = 4*i; 2053 v = aa + 16*ai[++i]; 2054 PREFETCH_NTA(v); 2055 STORE_PS(tmps,XMM7); 2056 2057 /* Promote result from float to double */ 2058 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 2059 } 2060 /* backward solve the upper triangular */ 2061 idt = 4*(n-1); 2062 ai16 = 16*diag[n-1]; 2063 v = aa + ai16 + 16; 2064 for (i=n-1; i>=0;){ 2065 PREFETCH_NTA(&v[8]); 2066 vi = aj + diag[i] + 1; 2067 nz = ai[i+1] - diag[i] - 1; 2068 2069 /* Demote accumulator from double to float */ 2070 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2071 LOAD_PS(tmps,XMM7); 2072 2073 while (nz--) { 2074 PREFETCH_NTA(&v[16]); 2075 idx = 4*(*vi++); 2076 2077 /* Demote solution (so far) from double to float */ 2078 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2079 2080 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2081 SSE_INLINE_BEGIN_2(tmpx,v) 2082 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2083 2084 /* First Column */ 2085 SSE_COPY_PS(XMM0,XMM6) 2086 SSE_SHUFFLE(XMM0,XMM0,0x00) 2087 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2088 SSE_SUB_PS(XMM7,XMM0) 2089 2090 /* Second Column */ 2091 SSE_COPY_PS(XMM1,XMM6) 2092 SSE_SHUFFLE(XMM1,XMM1,0x55) 2093 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2094 SSE_SUB_PS(XMM7,XMM1) 2095 2096 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2097 2098 /* Third Column */ 2099 SSE_COPY_PS(XMM2,XMM6) 2100 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2101 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2102 SSE_SUB_PS(XMM7,XMM2) 2103 2104 /* Fourth Column */ 2105 SSE_COPY_PS(XMM3,XMM6) 2106 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2107 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2108 SSE_SUB_PS(XMM7,XMM3) 2109 SSE_INLINE_END_2 2110 v += 16; 2111 } 2112 v = aa + ai16; 2113 ai16 = 16*diag[--i]; 2114 PREFETCH_NTA(aa+ai16+16); 2115 /* 2116 Scale the result by the diagonal 4x4 block, 2117 which was inverted as part of the factorization 2118 */ 2119 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2120 /* First Column */ 2121 SSE_COPY_PS(XMM0,XMM7) 2122 SSE_SHUFFLE(XMM0,XMM0,0x00) 2123 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2124 2125 /* Second Column */ 2126 SSE_COPY_PS(XMM1,XMM7) 2127 SSE_SHUFFLE(XMM1,XMM1,0x55) 2128 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2129 SSE_ADD_PS(XMM0,XMM1) 2130 2131 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2132 2133 /* Third Column */ 2134 SSE_COPY_PS(XMM2,XMM7) 2135 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2136 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2137 SSE_ADD_PS(XMM0,XMM2) 2138 2139 /* Fourth Column */ 2140 SSE_COPY_PS(XMM3,XMM7) 2141 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2142 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2143 SSE_ADD_PS(XMM0,XMM3) 2144 2145 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2146 SSE_INLINE_END_3 2147 2148 /* Promote solution from float to double */ 2149 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2150 2151 /* Apply reordering to t and stream into x. */ 2152 /* This way, x doesn't pollute the cache. */ 2153 /* Be careful with size: 2 doubles = 4 floats! */ 2154 idc = 4*(*c--); 2155 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2156 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2157 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2158 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2159 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2160 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2161 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2162 SSE_INLINE_END_2 2163 v = aa + ai16 + 16; 2164 idt -= 4; 2165 } 2166 2167 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2168 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2169 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2170 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2171 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2172 SSE_SCOPE_END; 2173 PetscFunctionReturn(0); 2174 } 2175 2176 #endif 2177 2178 2179 /* 2180 Special case where the matrix was ILU(0) factored in the natural 2181 ordering. This eliminates the need for the column and row permutation. 2182 */ 2183 #undef __FUNCT__ 2184 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2185 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2186 { 2187 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2188 PetscInt n=a->mbs; 2189 const PetscInt *ai=a->i,*aj=a->j; 2190 PetscErrorCode ierr; 2191 const PetscInt *diag = a->diag; 2192 const MatScalar *aa=a->a; 2193 PetscScalar *x; 2194 const PetscScalar *b; 2195 2196 PetscFunctionBegin; 2197 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2198 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2199 2200 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2201 { 2202 static PetscScalar w[2000]; /* very BAD need to fix */ 2203 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2204 } 2205 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2206 { 2207 static PetscScalar w[2000]; /* very BAD need to fix */ 2208 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2209 } 2210 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2211 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2212 #else 2213 { 2214 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2215 const MatScalar *v; 2216 PetscInt jdx,idt,idx,nz,i,ai16; 2217 const PetscInt *vi; 2218 2219 /* forward solve the lower triangular */ 2220 idx = 0; 2221 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2222 for (i=1; i<n; i++) { 2223 v = aa + 16*ai[i]; 2224 vi = aj + ai[i]; 2225 nz = diag[i] - ai[i]; 2226 idx += 4; 2227 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2228 while (nz--) { 2229 jdx = 4*(*vi++); 2230 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2231 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2232 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2233 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2234 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2235 v += 16; 2236 } 2237 x[idx] = s1; 2238 x[1+idx] = s2; 2239 x[2+idx] = s3; 2240 x[3+idx] = s4; 2241 } 2242 /* backward solve the upper triangular */ 2243 idt = 4*(n-1); 2244 for (i=n-1; i>=0; i--){ 2245 ai16 = 16*diag[i]; 2246 v = aa + ai16 + 16; 2247 vi = aj + diag[i] + 1; 2248 nz = ai[i+1] - diag[i] - 1; 2249 s1 = x[idt]; s2 = x[1+idt]; 2250 s3 = x[2+idt];s4 = x[3+idt]; 2251 while (nz--) { 2252 idx = 4*(*vi++); 2253 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2254 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2255 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2256 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2257 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2258 v += 16; 2259 } 2260 v = aa + ai16; 2261 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2262 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2263 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2264 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2265 idt -= 4; 2266 } 2267 } 2268 #endif 2269 2270 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2271 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2272 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2273 PetscFunctionReturn(0); 2274 } 2275 2276 #undef __FUNCT__ 2277 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2278 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2279 { 2280 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2281 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2282 PetscErrorCode ierr; 2283 PetscInt *diag = a->diag; 2284 MatScalar *aa=a->a; 2285 PetscScalar *x,*b; 2286 2287 PetscFunctionBegin; 2288 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2289 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2290 2291 { 2292 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2293 MatScalar *v,*t=(MatScalar *)x; 2294 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2295 2296 /* forward solve the lower triangular */ 2297 idx = 0; 2298 t[0] = (MatScalar)b[0]; 2299 t[1] = (MatScalar)b[1]; 2300 t[2] = (MatScalar)b[2]; 2301 t[3] = (MatScalar)b[3]; 2302 for (i=1; i<n; i++) { 2303 v = aa + 16*ai[i]; 2304 vi = aj + ai[i]; 2305 nz = diag[i] - ai[i]; 2306 idx += 4; 2307 s1 = (MatScalar)b[idx]; 2308 s2 = (MatScalar)b[1+idx]; 2309 s3 = (MatScalar)b[2+idx]; 2310 s4 = (MatScalar)b[3+idx]; 2311 while (nz--) { 2312 jdx = 4*(*vi++); 2313 x1 = t[jdx]; 2314 x2 = t[1+jdx]; 2315 x3 = t[2+jdx]; 2316 x4 = t[3+jdx]; 2317 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2318 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2319 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2320 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2321 v += 16; 2322 } 2323 t[idx] = s1; 2324 t[1+idx] = s2; 2325 t[2+idx] = s3; 2326 t[3+idx] = s4; 2327 } 2328 /* backward solve the upper triangular */ 2329 idt = 4*(n-1); 2330 for (i=n-1; i>=0; i--){ 2331 ai16 = 16*diag[i]; 2332 v = aa + ai16 + 16; 2333 vi = aj + diag[i] + 1; 2334 nz = ai[i+1] - diag[i] - 1; 2335 s1 = t[idt]; 2336 s2 = t[1+idt]; 2337 s3 = t[2+idt]; 2338 s4 = t[3+idt]; 2339 while (nz--) { 2340 idx = 4*(*vi++); 2341 x1 = (MatScalar)x[idx]; 2342 x2 = (MatScalar)x[1+idx]; 2343 x3 = (MatScalar)x[2+idx]; 2344 x4 = (MatScalar)x[3+idx]; 2345 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2346 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2347 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2348 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2349 v += 16; 2350 } 2351 v = aa + ai16; 2352 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2353 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2354 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2355 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2356 idt -= 4; 2357 } 2358 } 2359 2360 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2361 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2362 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2363 PetscFunctionReturn(0); 2364 } 2365 2366 #if defined (PETSC_HAVE_SSE) 2367 2368 #include PETSC_HAVE_SSE 2369 #undef __FUNCT__ 2370 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2371 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2372 { 2373 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2374 unsigned short *aj=(unsigned short *)a->j; 2375 PetscErrorCode ierr; 2376 int *ai=a->i,n=a->mbs,*diag = a->diag; 2377 MatScalar *aa=a->a; 2378 PetscScalar *x,*b; 2379 2380 PetscFunctionBegin; 2381 SSE_SCOPE_BEGIN; 2382 /* 2383 Note: This code currently uses demotion of double 2384 to float when performing the mixed-mode computation. 2385 This may not be numerically reasonable for all applications. 2386 */ 2387 PREFETCH_NTA(aa+16*ai[1]); 2388 2389 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2390 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2391 { 2392 /* x will first be computed in single precision then promoted inplace to double */ 2393 MatScalar *v,*t=(MatScalar *)x; 2394 int nz,i,idt,ai16; 2395 unsigned int jdx,idx; 2396 unsigned short *vi; 2397 /* Forward solve the lower triangular factor. */ 2398 2399 /* First block is the identity. */ 2400 idx = 0; 2401 CONVERT_DOUBLE4_FLOAT4(t,b); 2402 v = aa + 16*((unsigned int)ai[1]); 2403 2404 for (i=1; i<n;) { 2405 PREFETCH_NTA(&v[8]); 2406 vi = aj + ai[i]; 2407 nz = diag[i] - ai[i]; 2408 idx += 4; 2409 2410 /* Demote RHS from double to float. */ 2411 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2412 LOAD_PS(&t[idx],XMM7); 2413 2414 while (nz--) { 2415 PREFETCH_NTA(&v[16]); 2416 jdx = 4*((unsigned int)(*vi++)); 2417 2418 /* 4x4 Matrix-Vector product with negative accumulation: */ 2419 SSE_INLINE_BEGIN_2(&t[jdx],v) 2420 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2421 2422 /* First Column */ 2423 SSE_COPY_PS(XMM0,XMM6) 2424 SSE_SHUFFLE(XMM0,XMM0,0x00) 2425 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2426 SSE_SUB_PS(XMM7,XMM0) 2427 2428 /* Second Column */ 2429 SSE_COPY_PS(XMM1,XMM6) 2430 SSE_SHUFFLE(XMM1,XMM1,0x55) 2431 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2432 SSE_SUB_PS(XMM7,XMM1) 2433 2434 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2435 2436 /* Third Column */ 2437 SSE_COPY_PS(XMM2,XMM6) 2438 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2439 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2440 SSE_SUB_PS(XMM7,XMM2) 2441 2442 /* Fourth Column */ 2443 SSE_COPY_PS(XMM3,XMM6) 2444 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2445 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2446 SSE_SUB_PS(XMM7,XMM3) 2447 SSE_INLINE_END_2 2448 2449 v += 16; 2450 } 2451 v = aa + 16*ai[++i]; 2452 PREFETCH_NTA(v); 2453 STORE_PS(&t[idx],XMM7); 2454 } 2455 2456 /* Backward solve the upper triangular factor.*/ 2457 2458 idt = 4*(n-1); 2459 ai16 = 16*diag[n-1]; 2460 v = aa + ai16 + 16; 2461 for (i=n-1; i>=0;){ 2462 PREFETCH_NTA(&v[8]); 2463 vi = aj + diag[i] + 1; 2464 nz = ai[i+1] - diag[i] - 1; 2465 2466 LOAD_PS(&t[idt],XMM7); 2467 2468 while (nz--) { 2469 PREFETCH_NTA(&v[16]); 2470 idx = 4*((unsigned int)(*vi++)); 2471 2472 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2473 SSE_INLINE_BEGIN_2(&t[idx],v) 2474 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2475 2476 /* First Column */ 2477 SSE_COPY_PS(XMM0,XMM6) 2478 SSE_SHUFFLE(XMM0,XMM0,0x00) 2479 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2480 SSE_SUB_PS(XMM7,XMM0) 2481 2482 /* Second Column */ 2483 SSE_COPY_PS(XMM1,XMM6) 2484 SSE_SHUFFLE(XMM1,XMM1,0x55) 2485 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2486 SSE_SUB_PS(XMM7,XMM1) 2487 2488 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2489 2490 /* Third Column */ 2491 SSE_COPY_PS(XMM2,XMM6) 2492 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2493 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2494 SSE_SUB_PS(XMM7,XMM2) 2495 2496 /* Fourth Column */ 2497 SSE_COPY_PS(XMM3,XMM6) 2498 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2499 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2500 SSE_SUB_PS(XMM7,XMM3) 2501 SSE_INLINE_END_2 2502 v += 16; 2503 } 2504 v = aa + ai16; 2505 ai16 = 16*diag[--i]; 2506 PREFETCH_NTA(aa+ai16+16); 2507 /* 2508 Scale the result by the diagonal 4x4 block, 2509 which was inverted as part of the factorization 2510 */ 2511 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2512 /* First Column */ 2513 SSE_COPY_PS(XMM0,XMM7) 2514 SSE_SHUFFLE(XMM0,XMM0,0x00) 2515 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2516 2517 /* Second Column */ 2518 SSE_COPY_PS(XMM1,XMM7) 2519 SSE_SHUFFLE(XMM1,XMM1,0x55) 2520 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2521 SSE_ADD_PS(XMM0,XMM1) 2522 2523 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2524 2525 /* Third Column */ 2526 SSE_COPY_PS(XMM2,XMM7) 2527 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2528 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2529 SSE_ADD_PS(XMM0,XMM2) 2530 2531 /* Fourth Column */ 2532 SSE_COPY_PS(XMM3,XMM7) 2533 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2534 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2535 SSE_ADD_PS(XMM0,XMM3) 2536 2537 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2538 SSE_INLINE_END_3 2539 2540 v = aa + ai16 + 16; 2541 idt -= 4; 2542 } 2543 2544 /* Convert t from single precision back to double precision (inplace)*/ 2545 idt = 4*(n-1); 2546 for (i=n-1;i>=0;i--) { 2547 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2548 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2549 PetscScalar *xtemp=&x[idt]; 2550 MatScalar *ttemp=&t[idt]; 2551 xtemp[3] = (PetscScalar)ttemp[3]; 2552 xtemp[2] = (PetscScalar)ttemp[2]; 2553 xtemp[1] = (PetscScalar)ttemp[1]; 2554 xtemp[0] = (PetscScalar)ttemp[0]; 2555 idt -= 4; 2556 } 2557 2558 } /* End of artificial scope. */ 2559 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2560 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2561 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2562 SSE_SCOPE_END; 2563 PetscFunctionReturn(0); 2564 } 2565 2566 #undef __FUNCT__ 2567 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2568 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2569 { 2570 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2571 int *aj=a->j; 2572 PetscErrorCode ierr; 2573 int *ai=a->i,n=a->mbs,*diag = a->diag; 2574 MatScalar *aa=a->a; 2575 PetscScalar *x,*b; 2576 2577 PetscFunctionBegin; 2578 SSE_SCOPE_BEGIN; 2579 /* 2580 Note: This code currently uses demotion of double 2581 to float when performing the mixed-mode computation. 2582 This may not be numerically reasonable for all applications. 2583 */ 2584 PREFETCH_NTA(aa+16*ai[1]); 2585 2586 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2587 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2588 { 2589 /* x will first be computed in single precision then promoted inplace to double */ 2590 MatScalar *v,*t=(MatScalar *)x; 2591 int nz,i,idt,ai16; 2592 int jdx,idx; 2593 int *vi; 2594 /* Forward solve the lower triangular factor. */ 2595 2596 /* First block is the identity. */ 2597 idx = 0; 2598 CONVERT_DOUBLE4_FLOAT4(t,b); 2599 v = aa + 16*ai[1]; 2600 2601 for (i=1; i<n;) { 2602 PREFETCH_NTA(&v[8]); 2603 vi = aj + ai[i]; 2604 nz = diag[i] - ai[i]; 2605 idx += 4; 2606 2607 /* Demote RHS from double to float. */ 2608 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2609 LOAD_PS(&t[idx],XMM7); 2610 2611 while (nz--) { 2612 PREFETCH_NTA(&v[16]); 2613 jdx = 4*(*vi++); 2614 /* jdx = *vi++; */ 2615 2616 /* 4x4 Matrix-Vector product with negative accumulation: */ 2617 SSE_INLINE_BEGIN_2(&t[jdx],v) 2618 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2619 2620 /* First Column */ 2621 SSE_COPY_PS(XMM0,XMM6) 2622 SSE_SHUFFLE(XMM0,XMM0,0x00) 2623 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2624 SSE_SUB_PS(XMM7,XMM0) 2625 2626 /* Second Column */ 2627 SSE_COPY_PS(XMM1,XMM6) 2628 SSE_SHUFFLE(XMM1,XMM1,0x55) 2629 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2630 SSE_SUB_PS(XMM7,XMM1) 2631 2632 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2633 2634 /* Third Column */ 2635 SSE_COPY_PS(XMM2,XMM6) 2636 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2637 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2638 SSE_SUB_PS(XMM7,XMM2) 2639 2640 /* Fourth Column */ 2641 SSE_COPY_PS(XMM3,XMM6) 2642 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2643 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2644 SSE_SUB_PS(XMM7,XMM3) 2645 SSE_INLINE_END_2 2646 2647 v += 16; 2648 } 2649 v = aa + 16*ai[++i]; 2650 PREFETCH_NTA(v); 2651 STORE_PS(&t[idx],XMM7); 2652 } 2653 2654 /* Backward solve the upper triangular factor.*/ 2655 2656 idt = 4*(n-1); 2657 ai16 = 16*diag[n-1]; 2658 v = aa + ai16 + 16; 2659 for (i=n-1; i>=0;){ 2660 PREFETCH_NTA(&v[8]); 2661 vi = aj + diag[i] + 1; 2662 nz = ai[i+1] - diag[i] - 1; 2663 2664 LOAD_PS(&t[idt],XMM7); 2665 2666 while (nz--) { 2667 PREFETCH_NTA(&v[16]); 2668 idx = 4*(*vi++); 2669 /* idx = *vi++; */ 2670 2671 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2672 SSE_INLINE_BEGIN_2(&t[idx],v) 2673 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2674 2675 /* First Column */ 2676 SSE_COPY_PS(XMM0,XMM6) 2677 SSE_SHUFFLE(XMM0,XMM0,0x00) 2678 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2679 SSE_SUB_PS(XMM7,XMM0) 2680 2681 /* Second Column */ 2682 SSE_COPY_PS(XMM1,XMM6) 2683 SSE_SHUFFLE(XMM1,XMM1,0x55) 2684 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2685 SSE_SUB_PS(XMM7,XMM1) 2686 2687 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2688 2689 /* Third Column */ 2690 SSE_COPY_PS(XMM2,XMM6) 2691 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2692 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2693 SSE_SUB_PS(XMM7,XMM2) 2694 2695 /* Fourth Column */ 2696 SSE_COPY_PS(XMM3,XMM6) 2697 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2698 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2699 SSE_SUB_PS(XMM7,XMM3) 2700 SSE_INLINE_END_2 2701 v += 16; 2702 } 2703 v = aa + ai16; 2704 ai16 = 16*diag[--i]; 2705 PREFETCH_NTA(aa+ai16+16); 2706 /* 2707 Scale the result by the diagonal 4x4 block, 2708 which was inverted as part of the factorization 2709 */ 2710 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2711 /* First Column */ 2712 SSE_COPY_PS(XMM0,XMM7) 2713 SSE_SHUFFLE(XMM0,XMM0,0x00) 2714 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2715 2716 /* Second Column */ 2717 SSE_COPY_PS(XMM1,XMM7) 2718 SSE_SHUFFLE(XMM1,XMM1,0x55) 2719 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2720 SSE_ADD_PS(XMM0,XMM1) 2721 2722 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2723 2724 /* Third Column */ 2725 SSE_COPY_PS(XMM2,XMM7) 2726 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2727 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2728 SSE_ADD_PS(XMM0,XMM2) 2729 2730 /* Fourth Column */ 2731 SSE_COPY_PS(XMM3,XMM7) 2732 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2733 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2734 SSE_ADD_PS(XMM0,XMM3) 2735 2736 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2737 SSE_INLINE_END_3 2738 2739 v = aa + ai16 + 16; 2740 idt -= 4; 2741 } 2742 2743 /* Convert t from single precision back to double precision (inplace)*/ 2744 idt = 4*(n-1); 2745 for (i=n-1;i>=0;i--) { 2746 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2747 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2748 PetscScalar *xtemp=&x[idt]; 2749 MatScalar *ttemp=&t[idt]; 2750 xtemp[3] = (PetscScalar)ttemp[3]; 2751 xtemp[2] = (PetscScalar)ttemp[2]; 2752 xtemp[1] = (PetscScalar)ttemp[1]; 2753 xtemp[0] = (PetscScalar)ttemp[0]; 2754 idt -= 4; 2755 } 2756 2757 } /* End of artificial scope. */ 2758 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2759 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2760 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2761 SSE_SCOPE_END; 2762 PetscFunctionReturn(0); 2763 } 2764 2765 #endif 2766 #undef __FUNCT__ 2767 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2768 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2769 { 2770 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2771 IS iscol=a->col,isrow=a->row; 2772 PetscErrorCode ierr; 2773 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2774 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2775 const MatScalar *aa=a->a,*v; 2776 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2777 const PetscScalar *b; 2778 2779 PetscFunctionBegin; 2780 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2781 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2782 t = a->solve_work; 2783 2784 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2785 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2786 2787 /* forward solve the lower triangular */ 2788 idx = 3*(*r++); 2789 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2790 for (i=1; i<n; i++) { 2791 v = aa + 9*ai[i]; 2792 vi = aj + ai[i]; 2793 nz = diag[i] - ai[i]; 2794 idx = 3*(*r++); 2795 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2796 while (nz--) { 2797 idx = 3*(*vi++); 2798 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2799 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2800 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2801 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2802 v += 9; 2803 } 2804 idx = 3*i; 2805 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2806 } 2807 /* backward solve the upper triangular */ 2808 for (i=n-1; i>=0; i--){ 2809 v = aa + 9*diag[i] + 9; 2810 vi = aj + diag[i] + 1; 2811 nz = ai[i+1] - diag[i] - 1; 2812 idt = 3*i; 2813 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2814 while (nz--) { 2815 idx = 3*(*vi++); 2816 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2817 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2818 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2819 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2820 v += 9; 2821 } 2822 idc = 3*(*c--); 2823 v = aa + 9*diag[i]; 2824 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2825 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2826 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2827 } 2828 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2829 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2830 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2831 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2832 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 2833 PetscFunctionReturn(0); 2834 } 2835 2836 /* 2837 Special case where the matrix was ILU(0) factored in the natural 2838 ordering. This eliminates the need for the column and row permutation. 2839 */ 2840 #undef __FUNCT__ 2841 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2842 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2843 { 2844 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2845 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2846 PetscErrorCode ierr; 2847 PetscInt *diag = a->diag; 2848 const MatScalar *aa=a->a,*v; 2849 PetscScalar *x,s1,s2,s3,x1,x2,x3; 2850 const PetscScalar *b; 2851 PetscInt jdx,idt,idx,nz,*vi,i; 2852 2853 PetscFunctionBegin; 2854 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2855 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2856 2857 /* forward solve the lower triangular */ 2858 idx = 0; 2859 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2860 for (i=1; i<n; i++) { 2861 v = aa + 9*ai[i]; 2862 vi = aj + ai[i]; 2863 nz = diag[i] - ai[i]; 2864 idx += 3; 2865 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2866 while (nz--) { 2867 jdx = 3*(*vi++); 2868 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2869 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2870 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2871 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2872 v += 9; 2873 } 2874 x[idx] = s1; 2875 x[1+idx] = s2; 2876 x[2+idx] = s3; 2877 } 2878 /* backward solve the upper triangular */ 2879 for (i=n-1; i>=0; i--){ 2880 v = aa + 9*diag[i] + 9; 2881 vi = aj + diag[i] + 1; 2882 nz = ai[i+1] - diag[i] - 1; 2883 idt = 3*i; 2884 s1 = x[idt]; s2 = x[1+idt]; 2885 s3 = x[2+idt]; 2886 while (nz--) { 2887 idx = 3*(*vi++); 2888 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2889 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2890 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2891 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2892 v += 9; 2893 } 2894 v = aa + 9*diag[i]; 2895 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2896 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2897 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2898 } 2899 2900 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2901 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2902 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 2903 PetscFunctionReturn(0); 2904 } 2905 2906 #undef __FUNCT__ 2907 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2908 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2909 { 2910 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2911 IS iscol=a->col,isrow=a->row; 2912 PetscErrorCode ierr; 2913 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2914 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2915 const MatScalar *aa=a->a,*v; 2916 PetscScalar *x,s1,s2,x1,x2,*t; 2917 const PetscScalar *b; 2918 2919 PetscFunctionBegin; 2920 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2921 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2922 t = a->solve_work; 2923 2924 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2925 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2926 2927 /* forward solve the lower triangular */ 2928 idx = 2*(*r++); 2929 t[0] = b[idx]; t[1] = b[1+idx]; 2930 for (i=1; i<n; i++) { 2931 v = aa + 4*ai[i]; 2932 vi = aj + ai[i]; 2933 nz = diag[i] - ai[i]; 2934 idx = 2*(*r++); 2935 s1 = b[idx]; s2 = b[1+idx]; 2936 while (nz--) { 2937 idx = 2*(*vi++); 2938 x1 = t[idx]; x2 = t[1+idx]; 2939 s1 -= v[0]*x1 + v[2]*x2; 2940 s2 -= v[1]*x1 + v[3]*x2; 2941 v += 4; 2942 } 2943 idx = 2*i; 2944 t[idx] = s1; t[1+idx] = s2; 2945 } 2946 /* backward solve the upper triangular */ 2947 for (i=n-1; i>=0; i--){ 2948 v = aa + 4*diag[i] + 4; 2949 vi = aj + diag[i] + 1; 2950 nz = ai[i+1] - diag[i] - 1; 2951 idt = 2*i; 2952 s1 = t[idt]; s2 = t[1+idt]; 2953 while (nz--) { 2954 idx = 2*(*vi++); 2955 x1 = t[idx]; x2 = t[1+idx]; 2956 s1 -= v[0]*x1 + v[2]*x2; 2957 s2 -= v[1]*x1 + v[3]*x2; 2958 v += 4; 2959 } 2960 idc = 2*(*c--); 2961 v = aa + 4*diag[i]; 2962 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2963 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2964 } 2965 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2966 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2967 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2968 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2969 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 2970 PetscFunctionReturn(0); 2971 } 2972 2973 /* 2974 Special case where the matrix was ILU(0) factored in the natural 2975 ordering. This eliminates the need for the column and row permutation. 2976 */ 2977 #undef __FUNCT__ 2978 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2979 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2980 { 2981 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2982 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2983 PetscErrorCode ierr; 2984 PetscInt *diag = a->diag; 2985 const MatScalar *aa=a->a,*v; 2986 PetscScalar *x,s1,s2,x1,x2; 2987 const PetscScalar *b; 2988 PetscInt jdx,idt,idx,nz,*vi,i; 2989 2990 PetscFunctionBegin; 2991 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2992 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2993 2994 /* forward solve the lower triangular */ 2995 idx = 0; 2996 x[0] = b[0]; x[1] = b[1]; 2997 for (i=1; i<n; i++) { 2998 v = aa + 4*ai[i]; 2999 vi = aj + ai[i]; 3000 nz = diag[i] - ai[i]; 3001 idx += 2; 3002 s1 = b[idx];s2 = b[1+idx]; 3003 while (nz--) { 3004 jdx = 2*(*vi++); 3005 x1 = x[jdx];x2 = x[1+jdx]; 3006 s1 -= v[0]*x1 + v[2]*x2; 3007 s2 -= v[1]*x1 + v[3]*x2; 3008 v += 4; 3009 } 3010 x[idx] = s1; 3011 x[1+idx] = s2; 3012 } 3013 /* backward solve the upper triangular */ 3014 for (i=n-1; i>=0; i--){ 3015 v = aa + 4*diag[i] + 4; 3016 vi = aj + diag[i] + 1; 3017 nz = ai[i+1] - diag[i] - 1; 3018 idt = 2*i; 3019 s1 = x[idt]; s2 = x[1+idt]; 3020 while (nz--) { 3021 idx = 2*(*vi++); 3022 x1 = x[idx]; x2 = x[1+idx]; 3023 s1 -= v[0]*x1 + v[2]*x2; 3024 s2 -= v[1]*x1 + v[3]*x2; 3025 v += 4; 3026 } 3027 v = aa + 4*diag[i]; 3028 x[idt] = v[0]*s1 + v[2]*s2; 3029 x[1+idt] = v[1]*s1 + v[3]*s2; 3030 } 3031 3032 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3033 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3034 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3035 PetscFunctionReturn(0); 3036 } 3037 3038 #undef __FUNCT__ 3039 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3040 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 3041 { 3042 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3043 IS iscol=a->col,isrow=a->row; 3044 PetscErrorCode ierr; 3045 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3046 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3047 MatScalar *aa=a->a,*v; 3048 PetscScalar *x,*b,s1,*t; 3049 3050 PetscFunctionBegin; 3051 if (!n) PetscFunctionReturn(0); 3052 3053 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3054 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3055 t = a->solve_work; 3056 3057 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3058 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3059 3060 /* forward solve the lower triangular */ 3061 t[0] = b[*r++]; 3062 for (i=1; i<n; i++) { 3063 v = aa + ai[i]; 3064 vi = aj + ai[i]; 3065 nz = diag[i] - ai[i]; 3066 s1 = b[*r++]; 3067 while (nz--) { 3068 s1 -= (*v++)*t[*vi++]; 3069 } 3070 t[i] = s1; 3071 } 3072 /* backward solve the upper triangular */ 3073 for (i=n-1; i>=0; i--){ 3074 v = aa + diag[i] + 1; 3075 vi = aj + diag[i] + 1; 3076 nz = ai[i+1] - diag[i] - 1; 3077 s1 = t[i]; 3078 while (nz--) { 3079 s1 -= (*v++)*t[*vi++]; 3080 } 3081 x[*c--] = t[i] = aa[diag[i]]*s1; 3082 } 3083 3084 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3085 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3086 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3087 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3088 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3089 PetscFunctionReturn(0); 3090 } 3091 /* 3092 Special case where the matrix was ILU(0) factored in the natural 3093 ordering. This eliminates the need for the column and row permutation. 3094 */ 3095 #undef __FUNCT__ 3096 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3097 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3098 { 3099 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3100 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3101 PetscErrorCode ierr; 3102 PetscInt *diag = a->diag; 3103 MatScalar *aa=a->a; 3104 PetscScalar *x,*b; 3105 PetscScalar s1,x1; 3106 MatScalar *v; 3107 PetscInt jdx,idt,idx,nz,*vi,i; 3108 3109 PetscFunctionBegin; 3110 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3111 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3112 3113 /* forward solve the lower triangular */ 3114 idx = 0; 3115 x[0] = b[0]; 3116 for (i=1; i<n; i++) { 3117 v = aa + ai[i]; 3118 vi = aj + ai[i]; 3119 nz = diag[i] - ai[i]; 3120 idx += 1; 3121 s1 = b[idx]; 3122 while (nz--) { 3123 jdx = *vi++; 3124 x1 = x[jdx]; 3125 s1 -= v[0]*x1; 3126 v += 1; 3127 } 3128 x[idx] = s1; 3129 } 3130 /* backward solve the upper triangular */ 3131 for (i=n-1; i>=0; i--){ 3132 v = aa + diag[i] + 1; 3133 vi = aj + diag[i] + 1; 3134 nz = ai[i+1] - diag[i] - 1; 3135 idt = i; 3136 s1 = x[idt]; 3137 while (nz--) { 3138 idx = *vi++; 3139 x1 = x[idx]; 3140 s1 -= v[0]*x1; 3141 v += 1; 3142 } 3143 v = aa + diag[i]; 3144 x[idt] = v[0]*s1; 3145 } 3146 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3147 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3148 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3149 PetscFunctionReturn(0); 3150 } 3151 3152 /* ----------------------------------------------------------------*/ 3153 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 3154 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3155 3156 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 3157 #undef __FUNCT__ 3158 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 3159 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 3160 { 3161 Mat C=B; 3162 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 3163 IS isrow = b->row,isicol = b->icol; 3164 PetscErrorCode ierr; 3165 const PetscInt *r,*ic,*ics; 3166 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 3167 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3168 MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3169 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3170 MatScalar *v_work; 3171 3172 PetscFunctionBegin; 3173 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3174 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3175 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3176 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 3177 ics = ic; 3178 3179 /* generate work space needed by dense LU factorization */ 3180 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3181 multiplier = v_work + bs; 3182 v_pivots = (PetscInt*)(multiplier + bs2); 3183 3184 for (i=0; i<n; i++){ 3185 /* zero rtmp */ 3186 /* L part */ 3187 nz = bi[i+1] - bi[i]; 3188 bjtmp = bj + bi[i]; 3189 for (j=0; j<nz; j++){ 3190 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3191 } 3192 3193 /* U part */ 3194 nz = bi[2*n-i+1] - bi[2*n-i]; 3195 bjtmp = bj + bi[2*n-i]; 3196 for (j=0; j<nz; j++){ 3197 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3198 } 3199 3200 /* load in initial (unfactored row) */ 3201 nz = ai[r[i]+1] - ai[r[i]]; 3202 ajtmp = aj + ai[r[i]]; 3203 v = aa + bs2*ai[r[i]]; 3204 for (j=0; j<nz; j++) { 3205 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 3206 } 3207 3208 /* elimination */ 3209 bjtmp = bj + bi[i]; 3210 row = *bjtmp++; 3211 nzL = bi[i+1] - bi[i]; 3212 k = 0; 3213 while (k < nzL) { 3214 pc = rtmp + bs2*row; 3215 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3216 if (flg) { 3217 pv = b->a + bs2*bdiag[row]; 3218 Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 3219 pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3220 pv = b->a + bs2*bi[2*n-row]; 3221 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3222 for (j=0; j<nz; j++) { 3223 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3224 } 3225 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 3226 } 3227 row = *bjtmp++; k++; 3228 } 3229 3230 /* finished row so stick it into b->a */ 3231 /* L part */ 3232 pv = b->a + bs2*bi[i] ; 3233 pj = b->j + bi[i] ; 3234 nz = bi[i+1] - bi[i]; 3235 for (j=0; j<nz; j++) { 3236 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3237 } 3238 3239 /* Mark diagonal and invert diagonal for simplier triangular solves */ 3240 pv = b->a + bs2*bdiag[i]; 3241 pj = b->j + bdiag[i]; 3242 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3243 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3244 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 3245 3246 /* U part */ 3247 pv = b->a + bs2*bi[2*n-i]; 3248 pj = b->j + bi[2*n-i]; 3249 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3250 for (j=0; j<nz; j++){ 3251 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3252 } 3253 } 3254 3255 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3256 ierr = PetscFree(v_work);CHKERRQ(ierr); 3257 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3258 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3259 if (bs == 5){ 3260 C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3261 } else { 3262 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3263 } 3264 C->assembled = PETSC_TRUE; 3265 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 3266 PetscFunctionReturn(0); 3267 } 3268 3269 /* 3270 ilu(0) with natural ordering under new data structure. 3271 Factored arrays bj and ba are stored as 3272 L(0,:), L(1,:), ...,L(n-1,:), U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:) 3273 3274 bi=fact->i is an array of size 2n+2, in which 3275 bi+ 3276 bi[i] -> 1st entry of L(i,:),i=0,...,i-1 3277 bi[n] -> end of L(n-1,:)+1 3278 bi[n+1] -> 1st entry of U(n-1,:) 3279 bi[2n-i] -> 1st entry of U(i,:) 3280 bi[2n-i+1] -> end of U(i,:)+1, the 1st entry of U(i-1,:) 3281 bi[2n] -> end of U(0,:)+1 3282 3283 U(i,:) contains diag[i] as its last entry, i.e., 3284 U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i]) 3285 */ 3286 #undef __FUNCT__ 3287 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 3288 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3289 { 3290 3291 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3292 PetscErrorCode ierr; 3293 PetscInt mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 3294 PetscInt i,j,nz=a->nz,*bi,*bj,*bdiag; 3295 3296 PetscFunctionBegin; 3297 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3298 b = (Mat_SeqBAIJ*)(fact)->data; 3299 bdiag = b->diag; 3300 3301 /* replace matrix arrays with single allocations, then reset values */ 3302 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 3303 3304 ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr); 3305 ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr); 3306 ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr); 3307 b->singlemalloc = PETSC_FALSE; 3308 if (mbs > 0) { 3309 ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr); 3310 } 3311 3312 /* set bi and bj with new data structure */ 3313 bi = b->i; 3314 bj = b->j; 3315 3316 /* L part */ 3317 bi[0] = 0; 3318 for (i=0; i<mbs; i++){ 3319 nz = adiag[i] - ai[i]; 3320 bi[i+1] = bi[i] + nz; 3321 aj = a->j + ai[i]; 3322 for (j=0; j<nz; j++){ 3323 *bj = aj[j]; bj++; 3324 } 3325 } 3326 3327 /* U part */ 3328 bi[mbs+1] = bi[mbs]; 3329 for (i=mbs-1; i>=0; i--){ 3330 nz = ai[i+1] - adiag[i] - 1; 3331 if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz); 3332 bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1; 3333 aj = a->j + adiag[i] + 1; 3334 for (j=0; j<nz; j++){ 3335 *bj = aj[j]; bj++; 3336 } 3337 /* diag[i] */ 3338 *bj = i; bj++; 3339 bdiag[i] = bi[2*mbs-i+1]-1; 3340 } 3341 PetscFunctionReturn(0); 3342 } 3343 3344 /* 3345 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3346 except that the data structure of Mat_SeqAIJ is slightly different. 3347 Not a good example of code reuse. 3348 */ 3349 3350 #undef __FUNCT__ 3351 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3352 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3353 { 3354 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3355 IS isicol; 3356 PetscErrorCode ierr; 3357 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 3358 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3359 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3360 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3361 PetscTruth col_identity,row_identity,both_identity,flg; 3362 PetscReal f; 3363 3364 PetscFunctionBegin; 3365 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 3366 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 3367 3368 f = info->fill; 3369 levels = (PetscInt)info->levels; 3370 diagonal_fill = (PetscInt)info->diagonal_fill; 3371 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3372 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3373 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3374 both_identity = (PetscTruth) (row_identity && col_identity); 3375 3376 if (!levels && both_identity) { /* special case copy the nonzero structure */ 3377 3378 PetscTruth newdatastruct=PETSC_FALSE; 3379 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 3380 if (newdatastruct){ 3381 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 3382 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3383 } else { 3384 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3385 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3386 } 3387 3388 fact->factor = MAT_FACTOR_ILU; 3389 b = (Mat_SeqBAIJ*)(fact)->data; 3390 b->row = isrow; 3391 b->col = iscol; 3392 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3393 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3394 b->icol = isicol; 3395 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3396 ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3397 PetscFunctionReturn(0); 3398 } 3399 3400 /* general case perform the symbolic factorization */ 3401 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3402 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3403 3404 /* get new row pointers */ 3405 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3406 ainew[0] = 0; 3407 /* don't know how many column pointers are needed so estimate */ 3408 jmax = (PetscInt)(f*ai[n] + 1); 3409 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3410 /* ajfill is level of fill for each fill entry */ 3411 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3412 /* fill is a linked list of nonzeros in active row */ 3413 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3414 /* im is level for each filled value */ 3415 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3416 /* dloc is location of diagonal in factor */ 3417 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3418 dloc[0] = 0; 3419 for (prow=0; prow<n; prow++) { 3420 3421 /* copy prow into linked list */ 3422 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3423 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 3424 xi = aj + ai[r[prow]]; 3425 fill[n] = n; 3426 fill[prow] = -1; /* marker for diagonal entry */ 3427 while (nz--) { 3428 fm = n; 3429 idx = ic[*xi++]; 3430 do { 3431 m = fm; 3432 fm = fill[m]; 3433 } while (fm < idx); 3434 fill[m] = idx; 3435 fill[idx] = fm; 3436 im[idx] = 0; 3437 } 3438 3439 /* make sure diagonal entry is included */ 3440 if (diagonal_fill && fill[prow] == -1) { 3441 fm = n; 3442 while (fill[fm] < prow) fm = fill[fm]; 3443 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3444 fill[fm] = prow; 3445 im[prow] = 0; 3446 nzf++; 3447 dcount++; 3448 } 3449 3450 nzi = 0; 3451 row = fill[n]; 3452 while (row < prow) { 3453 incrlev = im[row] + 1; 3454 nz = dloc[row]; 3455 xi = ajnew + ainew[row] + nz + 1; 3456 flev = ajfill + ainew[row] + nz + 1; 3457 nnz = ainew[row+1] - ainew[row] - nz - 1; 3458 fm = row; 3459 while (nnz-- > 0) { 3460 idx = *xi++; 3461 if (*flev + incrlev > levels) { 3462 flev++; 3463 continue; 3464 } 3465 do { 3466 m = fm; 3467 fm = fill[m]; 3468 } while (fm < idx); 3469 if (fm != idx) { 3470 im[idx] = *flev + incrlev; 3471 fill[m] = idx; 3472 fill[idx] = fm; 3473 fm = idx; 3474 nzf++; 3475 } else { 3476 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3477 } 3478 flev++; 3479 } 3480 row = fill[row]; 3481 nzi++; 3482 } 3483 /* copy new filled row into permanent storage */ 3484 ainew[prow+1] = ainew[prow] + nzf; 3485 if (ainew[prow+1] > jmax) { 3486 3487 /* estimate how much additional space we will need */ 3488 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3489 /* just double the memory each time */ 3490 PetscInt maxadd = jmax; 3491 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3492 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3493 jmax += maxadd; 3494 3495 /* allocate a longer ajnew and ajfill */ 3496 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3497 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3498 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3499 ajnew = xitmp; 3500 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3501 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3502 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3503 ajfill = xitmp; 3504 reallocate++; /* count how many reallocations are needed */ 3505 } 3506 xitmp = ajnew + ainew[prow]; 3507 flev = ajfill + ainew[prow]; 3508 dloc[prow] = nzi; 3509 fm = fill[n]; 3510 while (nzf--) { 3511 *xitmp++ = fm; 3512 *flev++ = im[fm]; 3513 fm = fill[fm]; 3514 } 3515 /* make sure row has diagonal entry */ 3516 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3517 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3518 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3519 } 3520 } 3521 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3522 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3523 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3524 ierr = PetscFree(fill);CHKERRQ(ierr); 3525 ierr = PetscFree(im);CHKERRQ(ierr); 3526 3527 #if defined(PETSC_USE_INFO) 3528 { 3529 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3530 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3531 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3532 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3533 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3534 if (diagonal_fill) { 3535 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3536 } 3537 } 3538 #endif 3539 3540 /* put together the new matrix */ 3541 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3542 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3543 b = (Mat_SeqBAIJ*)(fact)->data; 3544 b->free_a = PETSC_TRUE; 3545 b->free_ij = PETSC_TRUE; 3546 b->singlemalloc = PETSC_FALSE; 3547 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3548 b->j = ajnew; 3549 b->i = ainew; 3550 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3551 b->diag = dloc; 3552 b->ilen = 0; 3553 b->imax = 0; 3554 b->row = isrow; 3555 b->col = iscol; 3556 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3557 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3558 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3559 b->icol = isicol; 3560 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3561 /* In b structure: Free imax, ilen, old a, old j. 3562 Allocate dloc, solve_work, new a, new j */ 3563 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3564 b->maxnz = b->nz = ainew[n]; 3565 3566 (fact)->info.factor_mallocs = reallocate; 3567 (fact)->info.fill_ratio_given = f; 3568 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3569 3570 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3571 PetscFunctionReturn(0); 3572 } 3573 3574 #undef __FUNCT__ 3575 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3576 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3577 { 3578 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3579 /* int i,*AJ=a->j,nz=a->nz; */ 3580 PetscFunctionBegin; 3581 /* Undo Column scaling */ 3582 /* while (nz--) { */ 3583 /* AJ[i] = AJ[i]/4; */ 3584 /* } */ 3585 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3586 A->ops->setunfactored = PETSC_NULL; 3587 PetscFunctionReturn(0); 3588 } 3589 3590 #undef __FUNCT__ 3591 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3592 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3593 { 3594 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3595 PetscInt *AJ=a->j,nz=a->nz; 3596 unsigned short *aj=(unsigned short *)AJ; 3597 PetscFunctionBegin; 3598 /* Is this really necessary? */ 3599 while (nz--) { 3600 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3601 } 3602 A->ops->setunfactored = PETSC_NULL; 3603 PetscFunctionReturn(0); 3604 } 3605 3606 3607