1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 124 PetscInt *diag = a->diag,oidx; 125 MatScalar *aa=a->a,*v; 126 PetscScalar s1,s2,s3,x1,x2,x3; 127 PetscScalar *x,*b; 128 129 PetscFunctionBegin; 130 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133 134 /* forward solve the U^T */ 135 idx = 0; 136 for (i=0; i<n; i++) { 137 138 v = aa + 9*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 141 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 142 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 143 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 144 v += 9; 145 146 vi = aj + diag[i] + 1; 147 nz = ai[i+1] - diag[i] - 1; 148 while 
(nz--) { 149 oidx = 3*(*vi++); 150 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 151 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 152 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 153 v += 9; 154 } 155 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 156 idx += 3; 157 } 158 /* backward solve the L^T */ 159 for (i=n-1; i>=0; i--){ 160 v = aa + 9*diag[i] - 9; 161 vi = aj + diag[i] - 1; 162 nz = diag[i] - ai[i]; 163 idt = 3*i; 164 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 165 while (nz--) { 166 idx = 3*(*vi--); 167 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 168 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 169 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 170 v -= 9; 171 } 172 } 173 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 175 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 176 PetscFunctionReturn(0); 177 } 178 179 #undef __FUNCT__ 180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 181 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 182 { 183 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 184 PetscErrorCode ierr; 185 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 186 PetscInt *diag = a->diag,oidx; 187 MatScalar *aa=a->a,*v; 188 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 189 PetscScalar *x,*b; 190 191 PetscFunctionBegin; 192 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 193 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 195 196 /* forward solve the U^T */ 197 idx = 0; 198 for (i=0; i<n; i++) { 199 200 v = aa + 16*diag[i]; 201 /* multiply by the inverse of the block diagonal */ 202 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 203 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 204 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 205 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 206 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 207 v += 16; 208 209 vi = aj + diag[i] + 1; 210 nz = ai[i+1] - diag[i] - 1; 211 while (nz--) { 212 oidx = 4*(*vi++); 213 
x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 214 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 215 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 216 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 217 v += 16; 218 } 219 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 220 idx += 4; 221 } 222 /* backward solve the L^T */ 223 for (i=n-1; i>=0; i--){ 224 v = aa + 16*diag[i] - 16; 225 vi = aj + diag[i] - 1; 226 nz = diag[i] - ai[i]; 227 idt = 4*i; 228 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 229 while (nz--) { 230 idx = 4*(*vi--); 231 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 232 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 233 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 234 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 235 v -= 16; 236 } 237 } 238 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 239 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 240 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 241 PetscFunctionReturn(0); 242 } 243 244 #undef __FUNCT__ 245 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 246 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 247 { 248 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 249 PetscErrorCode ierr; 250 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 251 PetscInt *diag = a->diag,oidx; 252 MatScalar *aa=a->a,*v; 253 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 254 PetscScalar *x,*b; 255 256 PetscFunctionBegin; 257 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 258 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 259 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 260 261 /* forward solve the U^T */ 262 idx = 0; 263 for (i=0; i<n; i++) { 264 265 v = aa + 25*diag[i]; 266 /* multiply by the inverse of the block diagonal */ 267 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 268 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 269 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 270 s3 = 
v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 271 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 272 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 273 v += 25; 274 275 vi = aj + diag[i] + 1; 276 nz = ai[i+1] - diag[i] - 1; 277 while (nz--) { 278 oidx = 5*(*vi++); 279 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 280 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 281 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 282 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 283 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 284 v += 25; 285 } 286 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 287 idx += 5; 288 } 289 /* backward solve the L^T */ 290 for (i=n-1; i>=0; i--){ 291 v = aa + 25*diag[i] - 25; 292 vi = aj + diag[i] - 1; 293 nz = diag[i] - ai[i]; 294 idt = 5*i; 295 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 296 while (nz--) { 297 idx = 5*(*vi--); 298 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 299 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 300 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 301 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 302 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 303 v -= 25; 304 } 305 } 306 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 307 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 308 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 309 PetscFunctionReturn(0); 310 } 311 312 #undef __FUNCT__ 313 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 314 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 315 { 316 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 317 PetscErrorCode ierr; 318 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 319 PetscInt *diag = a->diag,oidx; 320 MatScalar *aa=a->a,*v; 321 PetscScalar 
s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 322 PetscScalar *x,*b; 323 324 PetscFunctionBegin; 325 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 326 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 327 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 328 329 /* forward solve the U^T */ 330 idx = 0; 331 for (i=0; i<n; i++) { 332 333 v = aa + 36*diag[i]; 334 /* multiply by the inverse of the block diagonal */ 335 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 336 x6 = x[5+idx]; 337 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 338 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 339 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 340 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 341 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 342 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 343 v += 36; 344 345 vi = aj + diag[i] + 1; 346 nz = ai[i+1] - diag[i] - 1; 347 while (nz--) { 348 oidx = 6*(*vi++); 349 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 350 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 351 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 352 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 353 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 354 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 355 v += 36; 356 } 357 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 358 x[5+idx] = s6; 359 idx += 6; 360 } 361 /* backward solve the L^T */ 362 for (i=n-1; i>=0; i--){ 363 v = aa + 36*diag[i] - 36; 364 vi = aj + diag[i] - 1; 365 nz = diag[i] - ai[i]; 366 idt = 6*i; 367 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 368 s6 = x[5+idt]; 369 while (nz--) { 370 idx = 6*(*vi--); 371 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + 
v[5]*s6; 372 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 373 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 374 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 375 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 376 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 377 v -= 36; 378 } 379 } 380 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 383 PetscFunctionReturn(0); 384 } 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 388 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 389 { 390 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 391 PetscErrorCode ierr; 392 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 393 PetscInt *diag = a->diag,oidx; 394 MatScalar *aa=a->a,*v; 395 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 396 PetscScalar *x,*b; 397 398 PetscFunctionBegin; 399 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 400 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 401 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 402 403 /* forward solve the U^T */ 404 idx = 0; 405 for (i=0; i<n; i++) { 406 407 v = aa + 49*diag[i]; 408 /* multiply by the inverse of the block diagonal */ 409 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 410 x6 = x[5+idx]; x7 = x[6+idx]; 411 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 412 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 413 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 414 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 415 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 416 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 
+ v[40]*x6 + v[41]*x7; 417 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 418 v += 49; 419 420 vi = aj + diag[i] + 1; 421 nz = ai[i+1] - diag[i] - 1; 422 while (nz--) { 423 oidx = 7*(*vi++); 424 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 425 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 426 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 427 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 428 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 429 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 430 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 431 v += 49; 432 } 433 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 434 x[5+idx] = s6;x[6+idx] = s7; 435 idx += 7; 436 } 437 /* backward solve the L^T */ 438 for (i=n-1; i>=0; i--){ 439 v = aa + 49*diag[i] - 49; 440 vi = aj + diag[i] - 1; 441 nz = diag[i] - ai[i]; 442 idt = 7*i; 443 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 444 s6 = x[5+idt];s7 = x[6+idt]; 445 while (nz--) { 446 idx = 7*(*vi--); 447 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 448 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 449 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 450 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 451 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 452 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 453 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 454 v -= 49; 455 } 456 } 457 ierr = 
VecRestoreArray(bb,&b);CHKERRQ(ierr); 458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 460 PetscFunctionReturn(0); 461 } 462 463 /*---------------------------------------------------------------------------------------------*/ 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 466 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 469 IS iscol=a->col,isrow=a->row; 470 PetscErrorCode ierr; 471 const PetscInt *r,*c,*rout,*cout; 472 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 473 PetscInt *diag = a->diag; 474 MatScalar *aa=a->a,*v; 475 PetscScalar s1,*x,*b,*t; 476 477 PetscFunctionBegin; 478 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 482 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 483 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 484 485 /* copy the b into temp work space according to permutation */ 486 for (i=0; i<n; i++) { 487 t[i] = b[c[i]]; 488 } 489 490 /* forward solve the U^T */ 491 for (i=0; i<n; i++) { 492 493 v = aa + diag[i]; 494 /* multiply by the inverse of the block diagonal */ 495 s1 = (*v++)*t[i]; 496 vi = aj + diag[i] + 1; 497 nz = ai[i+1] - diag[i] - 1; 498 while (nz--) { 499 t[*vi++] -= (*v++)*s1; 500 } 501 t[i] = s1; 502 } 503 /* backward solve the L^T */ 504 for (i=n-1; i>=0; i--){ 505 v = aa + diag[i] - 1; 506 vi = aj + diag[i] - 1; 507 nz = diag[i] - ai[i]; 508 s1 = t[i]; 509 while (nz--) { 510 t[*vi--] -= (*v--)*s1; 511 } 512 } 513 514 /* copy t into x according to permutation */ 515 for (i=0; i<n; i++) { 516 x[r[i]] = t[i]; 517 } 518 519 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 520 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 521 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 523 ierr = PetscLogFlops(2.0*(a->nz) - 
A->cmap->n);CHKERRQ(ierr); 524 PetscFunctionReturn(0); 525 } 526 527 #undef __FUNCT__ 528 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 529 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 530 { 531 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 532 IS iscol=a->col,isrow=a->row; 533 PetscErrorCode ierr; 534 const PetscInt *r,*c,*rout,*cout; 535 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 536 PetscInt *diag = a->diag,ii,ic,ir,oidx; 537 MatScalar *aa=a->a,*v; 538 PetscScalar s1,s2,x1,x2; 539 PetscScalar *x,*b,*t; 540 541 PetscFunctionBegin; 542 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544 t = a->solve_work; 545 546 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 547 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 548 549 /* copy the b into temp work space according to permutation */ 550 ii = 0; 551 for (i=0; i<n; i++) { 552 ic = 2*c[i]; 553 t[ii] = b[ic]; 554 t[ii+1] = b[ic+1]; 555 ii += 2; 556 } 557 558 /* forward solve the U^T */ 559 idx = 0; 560 for (i=0; i<n; i++) { 561 562 v = aa + 4*diag[i]; 563 /* multiply by the inverse of the block diagonal */ 564 x1 = t[idx]; x2 = t[1+idx]; 565 s1 = v[0]*x1 + v[1]*x2; 566 s2 = v[2]*x1 + v[3]*x2; 567 v += 4; 568 569 vi = aj + diag[i] + 1; 570 nz = ai[i+1] - diag[i] - 1; 571 while (nz--) { 572 oidx = 2*(*vi++); 573 t[oidx] -= v[0]*s1 + v[1]*s2; 574 t[oidx+1] -= v[2]*s1 + v[3]*s2; 575 v += 4; 576 } 577 t[idx] = s1;t[1+idx] = s2; 578 idx += 2; 579 } 580 /* backward solve the L^T */ 581 for (i=n-1; i>=0; i--){ 582 v = aa + 4*diag[i] - 4; 583 vi = aj + diag[i] - 1; 584 nz = diag[i] - ai[i]; 585 idt = 2*i; 586 s1 = t[idt]; s2 = t[1+idt]; 587 while (nz--) { 588 idx = 2*(*vi--); 589 t[idx] -= v[0]*s1 + v[1]*s2; 590 t[idx+1] -= v[2]*s1 + v[3]*s2; 591 v -= 4; 592 } 593 } 594 595 /* copy t into x according to permutation */ 596 ii = 0; 597 for (i=0; i<n; i++) { 598 ir = 2*r[i]; 599 x[ir] = t[ii]; 600 x[ir+1] = t[ii+1]; 601 ii += 2; 602 } 603 604 ierr = 
ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 605 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 606 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 608 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 609 PetscFunctionReturn(0); 610 } 611 612 #undef __FUNCT__ 613 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 614 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 615 { 616 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 617 IS iscol=a->col,isrow=a->row; 618 PetscErrorCode ierr; 619 const PetscInt *r,*c,*rout,*cout; 620 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 621 PetscInt *diag = a->diag,ii,ic,ir,oidx; 622 MatScalar *aa=a->a,*v; 623 PetscScalar s1,s2,s3,x1,x2,x3; 624 PetscScalar *x,*b,*t; 625 626 PetscFunctionBegin; 627 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 628 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 629 t = a->solve_work; 630 631 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 632 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 633 634 /* copy the b into temp work space according to permutation */ 635 ii = 0; 636 for (i=0; i<n; i++) { 637 ic = 3*c[i]; 638 t[ii] = b[ic]; 639 t[ii+1] = b[ic+1]; 640 t[ii+2] = b[ic+2]; 641 ii += 3; 642 } 643 644 /* forward solve the U^T */ 645 idx = 0; 646 for (i=0; i<n; i++) { 647 648 v = aa + 9*diag[i]; 649 /* multiply by the inverse of the block diagonal */ 650 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 651 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 652 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 653 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 654 v += 9; 655 656 vi = aj + diag[i] + 1; 657 nz = ai[i+1] - diag[i] - 1; 658 while (nz--) { 659 oidx = 3*(*vi++); 660 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663 v += 9; 664 } 665 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 666 idx += 3; 667 } 668 /* backward solve the L^T */ 669 for (i=n-1; i>=0; i--){ 670 v = aa + 
9*diag[i] - 9; 671 vi = aj + diag[i] - 1; 672 nz = diag[i] - ai[i]; 673 idt = 3*i; 674 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 675 while (nz--) { 676 idx = 3*(*vi--); 677 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 678 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 679 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 680 v -= 9; 681 } 682 } 683 684 /* copy t into x according to permutation */ 685 ii = 0; 686 for (i=0; i<n; i++) { 687 ir = 3*r[i]; 688 x[ir] = t[ii]; 689 x[ir+1] = t[ii+1]; 690 x[ir+2] = t[ii+2]; 691 ii += 3; 692 } 693 694 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 695 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 696 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 #undef __FUNCT__ 703 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 704 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 705 { 706 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 707 IS iscol=a->col,isrow=a->row; 708 PetscErrorCode ierr; 709 const PetscInt *r,*c,*rout,*cout; 710 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 711 PetscInt *diag = a->diag,ii,ic,ir,oidx; 712 MatScalar *aa=a->a,*v; 713 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 714 PetscScalar *x,*b,*t; 715 716 PetscFunctionBegin; 717 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 t = a->solve_work; 720 721 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 722 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 723 724 /* copy the b into temp work space according to permutation */ 725 ii = 0; 726 for (i=0; i<n; i++) { 727 ic = 4*c[i]; 728 t[ii] = b[ic]; 729 t[ii+1] = b[ic+1]; 730 t[ii+2] = b[ic+2]; 731 t[ii+3] = b[ic+3]; 732 ii += 4; 733 } 734 735 /* forward solve the U^T */ 736 idx = 0; 737 for (i=0; i<n; i++) { 738 739 v = aa + 16*diag[i]; 740 /* multiply by the inverse of the block diagonal */ 741 x1 = 
t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 742 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 743 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 744 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 745 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 746 v += 16; 747 748 vi = aj + diag[i] + 1; 749 nz = ai[i+1] - diag[i] - 1; 750 while (nz--) { 751 oidx = 4*(*vi++); 752 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 753 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 754 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 755 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 756 v += 16; 757 } 758 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 759 idx += 4; 760 } 761 /* backward solve the L^T */ 762 for (i=n-1; i>=0; i--){ 763 v = aa + 16*diag[i] - 16; 764 vi = aj + diag[i] - 1; 765 nz = diag[i] - ai[i]; 766 idt = 4*i; 767 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 768 while (nz--) { 769 idx = 4*(*vi--); 770 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 771 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 772 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 773 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 774 v -= 16; 775 } 776 } 777 778 /* copy t into x according to permutation */ 779 ii = 0; 780 for (i=0; i<n; i++) { 781 ir = 4*r[i]; 782 x[ir] = t[ii]; 783 x[ir+1] = t[ii+1]; 784 x[ir+2] = t[ii+2]; 785 x[ir+3] = t[ii+3]; 786 ii += 4; 787 } 788 789 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 790 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 791 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 793 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 794 PetscFunctionReturn(0); 795 } 796 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 799 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 800 { 801 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 802 IS iscol=a->col,isrow=a->row; 803 PetscErrorCode ierr; 804 
const PetscInt *r,*c,*rout,*cout; 805 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 806 PetscInt *diag = a->diag,ii,ic,ir,oidx; 807 MatScalar *aa=a->a,*v; 808 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 809 PetscScalar *x,*b,*t; 810 811 PetscFunctionBegin; 812 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 813 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 814 t = a->solve_work; 815 816 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 817 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 818 819 /* copy the b into temp work space according to permutation */ 820 ii = 0; 821 for (i=0; i<n; i++) { 822 ic = 5*c[i]; 823 t[ii] = b[ic]; 824 t[ii+1] = b[ic+1]; 825 t[ii+2] = b[ic+2]; 826 t[ii+3] = b[ic+3]; 827 t[ii+4] = b[ic+4]; 828 ii += 5; 829 } 830 831 /* forward solve the U^T */ 832 idx = 0; 833 for (i=0; i<n; i++) { 834 835 v = aa + 25*diag[i]; 836 /* multiply by the inverse of the block diagonal */ 837 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 838 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 839 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 840 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 841 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 842 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 843 v += 25; 844 845 vi = aj + diag[i] + 1; 846 nz = ai[i+1] - diag[i] - 1; 847 while (nz--) { 848 oidx = 5*(*vi++); 849 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v += 25; 855 } 856 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 857 idx += 5; 858 } 859 /* backward solve the L^T */ 860 for (i=n-1; i>=0; i--){ 861 v = aa + 25*diag[i] - 25; 862 vi = aj + diag[i] - 1; 863 
nz = diag[i] - ai[i]; 864 idt = 5*i; 865 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 866 while (nz--) { 867 idx = 5*(*vi--); 868 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 869 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 870 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 871 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 872 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 873 v -= 25; 874 } 875 } 876 877 /* copy t into x according to permutation */ 878 ii = 0; 879 for (i=0; i<n; i++) { 880 ir = 5*r[i]; 881 x[ir] = t[ii]; 882 x[ir+1] = t[ii+1]; 883 x[ir+2] = t[ii+2]; 884 x[ir+3] = t[ii+3]; 885 x[ir+4] = t[ii+4]; 886 ii += 5; 887 } 888 889 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 890 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 891 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 893 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 894 PetscFunctionReturn(0); 895 } 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 899 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 902 IS iscol=a->col,isrow=a->row; 903 PetscErrorCode ierr; 904 const PetscInt *r,*c,*rout,*cout; 905 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 906 PetscInt *diag = a->diag,ii,ic,ir,oidx; 907 MatScalar *aa=a->a,*v; 908 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 909 PetscScalar *x,*b,*t; 910 911 PetscFunctionBegin; 912 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 t = a->solve_work; 915 916 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 917 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 918 919 /* copy the b into temp work space according to permutation */ 920 ii = 0; 921 for (i=0; i<n; i++) { 922 ic = 6*c[i]; 923 t[ii] = b[ic]; 924 t[ii+1] 
= b[ic+1]; 925 t[ii+2] = b[ic+2]; 926 t[ii+3] = b[ic+3]; 927 t[ii+4] = b[ic+4]; 928 t[ii+5] = b[ic+5]; 929 ii += 6; 930 } 931 932 /* forward solve the U^T */ 933 idx = 0; 934 for (i=0; i<n; i++) { 935 936 v = aa + 36*diag[i]; 937 /* multiply by the inverse of the block diagonal */ 938 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 939 x6 = t[5+idx]; 940 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 941 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 942 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 943 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 944 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 945 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 946 v += 36; 947 948 vi = aj + diag[i] + 1; 949 nz = ai[i+1] - diag[i] - 1; 950 while (nz--) { 951 oidx = 6*(*vi++); 952 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 953 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 954 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 955 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 956 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 957 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 958 v += 36; 959 } 960 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 961 t[5+idx] = s6; 962 idx += 6; 963 } 964 /* backward solve the L^T */ 965 for (i=n-1; i>=0; i--){ 966 v = aa + 36*diag[i] - 36; 967 vi = aj + diag[i] - 1; 968 nz = diag[i] - ai[i]; 969 idt = 6*i; 970 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 971 s6 = t[5+idt]; 972 while (nz--) { 973 idx = 6*(*vi--); 974 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 975 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 976 t[idx+2] 
-= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 977 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 978 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 979 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 980 v -= 36; 981 } 982 } 983 984 /* copy t into x according to permutation */ 985 ii = 0; 986 for (i=0; i<n; i++) { 987 ir = 6*r[i]; 988 x[ir] = t[ii]; 989 x[ir+1] = t[ii+1]; 990 x[ir+2] = t[ii+2]; 991 x[ir+3] = t[ii+3]; 992 x[ir+4] = t[ii+4]; 993 x[ir+5] = t[ii+5]; 994 ii += 6; 995 } 996 997 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 998 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 999 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1000 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1001 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1002 PetscFunctionReturn(0); 1003 } 1004 1005 #undef __FUNCT__ 1006 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1007 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1008 { 1009 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1010 IS iscol=a->col,isrow=a->row; 1011 PetscErrorCode ierr; 1012 const PetscInt *r,*c,*rout,*cout; 1013 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1014 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1015 MatScalar *aa=a->a,*v; 1016 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1017 PetscScalar *x,*b,*t; 1018 1019 PetscFunctionBegin; 1020 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1022 t = a->solve_work; 1023 1024 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1025 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1026 1027 /* copy the b into temp work space according to permutation */ 1028 ii = 0; 1029 for (i=0; i<n; i++) { 1030 ic = 7*c[i]; 1031 t[ii] = b[ic]; 1032 t[ii+1] = b[ic+1]; 1033 t[ii+2] = b[ic+2]; 1034 t[ii+3] = b[ic+3]; 1035 t[ii+4] = b[ic+4]; 1036 t[ii+5] = b[ic+5]; 1037 
t[ii+6] = b[ic+6]; 1038 ii += 7; 1039 } 1040 1041 /* forward solve the U^T */ 1042 idx = 0; 1043 for (i=0; i<n; i++) { 1044 1045 v = aa + 49*diag[i]; 1046 /* multiply by the inverse of the block diagonal */ 1047 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1048 x6 = t[5+idx]; x7 = t[6+idx]; 1049 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1050 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1051 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1052 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1053 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1054 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1055 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1056 v += 49; 1057 1058 vi = aj + diag[i] + 1; 1059 nz = ai[i+1] - diag[i] - 1; 1060 while (nz--) { 1061 oidx = 7*(*vi++); 1062 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v += 49; 1070 } 1071 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1072 t[5+idx] = s6;t[6+idx] = s7; 1073 idx += 7; 1074 } 1075 /* backward solve the L^T */ 1076 for (i=n-1; i>=0; i--){ 1077 v = aa + 49*diag[i] - 49; 1078 vi = aj + diag[i] - 1; 1079 nz = diag[i] - ai[i]; 1080 idt = 7*i; 
1081 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1082 s6 = t[5+idt];s7 = t[6+idt]; 1083 while (nz--) { 1084 idx = 7*(*vi--); 1085 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1086 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1087 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1088 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1089 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1090 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1091 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1092 v -= 49; 1093 } 1094 } 1095 1096 /* copy t into x according to permutation */ 1097 ii = 0; 1098 for (i=0; i<n; i++) { 1099 ir = 7*r[i]; 1100 x[ir] = t[ii]; 1101 x[ir+1] = t[ii+1]; 1102 x[ir+2] = t[ii+2]; 1103 x[ir+3] = t[ii+3]; 1104 x[ir+4] = t[ii+4]; 1105 x[ir+5] = t[ii+5]; 1106 x[ir+6] = t[ii+6]; 1107 ii += 7; 1108 } 1109 1110 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1111 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1112 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1113 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1114 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1115 PetscFunctionReturn(0); 1116 }

/* ----------------------------------------------------------- */
/*
   MatSolve_SeqBAIJ_N - Forward and backward block triangular solves for a
   SeqBAIJ matrix whose block size is read at run time from A->rmap->bs.

   Input Parameters:
+  A  - the matrix; its data pointer is used as a Mat_SeqBAIJ
-  bb - the right-hand side

   Output Parameter:
.  xx - the result

   Notes:
   Block row i of the right-hand side is fetched through the row index set
   (a->row) and block row i of the result is stored through the column
   index set (a->col).  a->solve_work holds the intermediate vector; the
   bs entries starting at offset A->cmap->n in that buffer are used as a
   per-row scratch block during the backward sweep.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_N"
PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol = a->col,isrow = a->row;
  PetscErrorCode ierr;
  const PetscInt *rperm,*cperm,*ai = a->i,*aj = a->j,*adiag = a->diag,*cols;
  PetscInt       row,jj,nblk = a->mbs,cnt;
  PetscInt       bs = A->rmap->bs,bs2 = a->bs2;
  MatScalar      *aa = a->a,*blk;
  PetscScalar    *x,*b,*cur,*work,*scratch;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  work = a->solve_work;

  ierr = ISGetIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cperm);CHKERRQ(ierr);

  /* forward solve the lower triangular; block row 0 is a plain copy */
  ierr = PetscMemcpy(work,b+bs*rperm[0],bs*sizeof(PetscScalar));CHKERRQ(ierr);
  for (row=1; row<nblk; row++) {
    blk  = aa + bs2*ai[row];
    cols = aj + ai[row];
    cnt  = adiag[row] - ai[row];      /* blocks stored before the diagonal */
    cur  = work + bs*row;
    ierr = PetscMemcpy(cur,b+bs*rperm[row],bs*sizeof(PetscScalar));CHKERRQ(ierr);
    for (jj=0; jj<cnt; jj++) {
      Kernel_v_gets_v_minus_A_times_w(bs,cur,blk,work+bs*cols[jj]);
      blk += bs2;
    }
  }

  /* backward solve the upper triangular */
  scratch = a->solve_work + A->cmap->n;
  for (row=nblk-1; row>=0; row--) {
    blk  = aa + bs2*(adiag[row] + 1);
    cols = aj + adiag[row] + 1;
    cnt  = ai[row+1] - adiag[row] - 1; /* blocks stored after the diagonal */
    cur  = work + bs*row;
    ierr = PetscMemcpy(scratch,cur,bs*sizeof(PetscScalar));CHKERRQ(ierr);
    for (jj=0; jj<cnt; jj++) {
      Kernel_v_gets_v_minus_A_times_w(bs,scratch,blk,work+bs*cols[jj]);
      blk += bs2;
    }
    /* apply the block stored at the diagonal position, then scatter the row */
    Kernel_w_gets_A_times_v(bs,scratch,aa+bs2*adiag[row],cur);
    ierr = PetscMemcpy(x+bs*cperm[row],cur,bs*sizeof(PetscScalar));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1178 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1179 { 1180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1181 IS iscol=a->col,isrow=a->row; 1182 PetscErrorCode ierr; 1183 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1184 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1185 MatScalar *aa=a->a,*v; 1186 PetscScalar
s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1187 PetscScalar *x,*b,*t; 1188 1189 PetscFunctionBegin; 1190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1192 t = a->solve_work; 1193 1194 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1195 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1196 1197 /* forward solve the lower triangular */ 1198 idx = 7*(*r++); 1199 t[0] = b[idx]; t[1] = b[1+idx]; 1200 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1201 t[5] = b[5+idx]; t[6] = b[6+idx]; 1202 1203 for (i=1; i<n; i++) { 1204 v = aa + 49*ai[i]; 1205 vi = aj + ai[i]; 1206 nz = diag[i] - ai[i]; 1207 idx = 7*(*r++); 1208 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1209 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1210 while (nz--) { 1211 idx = 7*(*vi++); 1212 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1213 x4 = t[3+idx];x5 = t[4+idx]; 1214 x6 = t[5+idx];x7 = t[6+idx]; 1215 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1222 v += 49; 1223 } 1224 idx = 7*i; 1225 t[idx] = s1;t[1+idx] = s2; 1226 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1227 t[5+idx] = s6;t[6+idx] = s7; 1228 } 1229 /* backward solve the upper triangular */ 1230 for (i=n-1; i>=0; i--){ 1231 v = aa + 49*diag[i] + 49; 1232 vi = aj + diag[i] + 1; 1233 nz = ai[i+1] - diag[i] - 1; 1234 idt = 7*i; 1235 s1 = t[idt]; s2 = t[1+idt]; 1236 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1237 s6 = t[5+idt];s7 = 
t[6+idt]; 1238 while (nz--) { 1239 idx = 7*(*vi++); 1240 x1 = t[idx]; x2 = t[1+idx]; 1241 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242 x6 = t[5+idx]; x7 = t[6+idx]; 1243 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1244 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1245 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1246 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1247 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1248 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1249 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1250 v += 49; 1251 } 1252 idc = 7*(*c--); 1253 v = aa + 49*diag[i]; 1254 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1255 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1256 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1257 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1258 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1259 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1260 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1261 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1262 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1263 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1264 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1265 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1266 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1267 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1268 } 1269 1270 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1271 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1272 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1273 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1274 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1275 PetscFunctionReturn(0); 1276 }

/*
   MatSolve_SeqBAIJ_7_newdatastruct - Forward and backward block triangular
   solves with 7x7 blocks (49 values per block), using the row and column
   index sets a->row and a->col.

   Input Parameters:
+  A  - the matrix; its data pointer is used as a Mat_SeqBAIJ
-  bb - the right-hand side

   Output Parameter:
.  xx - the result

   Notes:
   Lower rows are addressed through ai[i]..ai[i+1].  The upper part of block
   row i is addressed through ai[k]..ai[k+1] with k = 2*n-i; the last block
   of that range is applied to the accumulated row after the off-diagonal
   blocks have been eliminated (presumably the stored inverse of the
   diagonal block - confirm against the factorization routine).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  IS                iscol = a->col,isrow = a->row;
  PetscErrorCode    ierr;
  const PetscInt    *rperm,*cperm,*ai = a->i,*aj = a->j,*cols;
  PetscInt          row,urow,jj,cnt,nblk = a->mbs;
  MatScalar         *aa = a->a,*v;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscScalar       *x,*b,*work,*dst,*out;
  const PetscScalar *src;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  work = a->solve_work;

  ierr = ISGetIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cperm);CHKERRQ(ierr);

  /* forward solve the lower triangular; block row 0 is a plain copy */
  src     = b + 7*rperm[0];
  work[0] = src[0];
  work[1] = src[1];
  work[2] = src[2];
  work[3] = src[3];
  work[4] = src[4];
  work[5] = src[5];
  work[6] = src[6];

  for (row=1; row<nblk; row++) {
    v    = aa + 49*ai[row];
    cols = aj + ai[row];
    cnt  = ai[row+1] - ai[row];
    src  = b + 7*rperm[row];
    s1 = src[0]; s2 = src[1]; s3 = src[2]; s4 = src[3];
    s5 = src[4]; s6 = src[5]; s7 = src[6];
    for (jj=0; jj<cnt; jj++) {
      src = work + 7*cols[jj];
      x1 = src[0]; x2 = src[1]; x3 = src[2]; x4 = src[3];
      x5 = src[4]; x6 = src[5]; x7 = src[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    dst    = work + 7*row;
    dst[0] = s1;
    dst[1] = s2;
    dst[2] = s3;
    dst[3] = s4;
    dst[4] = s5;
    dst[5] = s6;
    dst[6] = s7;
  }

  /* backward solve the upper triangular */
  for (row=nblk-1; row>=0; row--) {
    urow = 2*nblk - row;
    v    = aa + 49*ai[urow];
    cols = aj + ai[urow];
    cnt  = ai[urow+1] - ai[urow] - 1;  /* every block of the range but the last */
    dst  = work + 7*row;
    s1 = dst[0]; s2 = dst[1]; s3 = dst[2]; s4 = dst[3];
    s5 = dst[4]; s6 = dst[5]; s7 = dst[6];
    for (jj=0; jj<cnt; jj++) {
      src = work + 7*cols[jj];
      x1 = src[0]; x2 = src[1]; x3 = src[2]; x4 = src[3];
      x5 = src[4]; x6 = src[5]; x7 = src[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* v now addresses the last block of the range; apply it and scatter */
    out    = x + 7*cperm[row];
    out[0] = dst[0] = v[0]*s1+v[7]*s2+v[14]*s3+v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
    out[1] = dst[1] = v[1]*s1+v[8]*s2+v[15]*s3+v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
    out[2] = dst[2] = v[2]*s1+v[9]*s2+v[16]*s3+v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
    out[3] = dst[3] = v[3]*s1+v[10]*s2+v[17]*s3+v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
    out[4] = dst[4] = v[4]*s1+v[11]*s2+v[18]*s3+v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
    out[5] = dst[5] = v[5]*s1+v[12]*s2+v[19]*s3+v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
    out[6] = dst[6] = v[6]*s1+v[13]*s2+v[20]*s3+v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
  }

  ierr = ISRestoreIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1380 #undef __FUNCT__ 1381 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct_v2" 1382 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1383 { 1384 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1385 IS iscol=a->col,isrow=a->row; 1386 PetscErrorCode ierr; 1387 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 1388 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 1389 MatScalar *aa=a->a,*v; 1390 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1391 PetscScalar *x,*b,*t; 1392 1393 PetscFunctionBegin; 1394 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1395 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1396 t = a->solve_work; 1397 1398 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1399 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1400 1401 /* forward solve the lower triangular */ 1402 idx = 7*r[0]; 1403 t[0] = b[idx]; t[1] = b[1+idx]; 1404 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1405 t[5] = b[5+idx]; t[6] = b[6+idx]; 1406 1407 for (i=1; i<n; i++) { 1408 v = aa + 49*ai[i]; 1409 vi = aj + ai[i]; 1410 nz = ai[i+1] - ai[i]; 1411 idx = 7*r[i]; 1412 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1413 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1414 for(m=0;m<nz;m++){ 1415 idx = 7*vi[m]; 1416 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1417 x4 = t[3+idx];x5 = t[4+idx]; 1418 x6 = t[5+idx];x7 = t[6+idx]; 1419 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1420 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1421 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1422 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1423 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1424 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1425 s7
-= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1426 v += 49; 1427 } 1428 idx = 7*i; 1429 t[idx] = s1;t[1+idx] = s2; 1430 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1431 t[5+idx] = s6;t[6+idx] = s7; 1432 } 1433 /* backward solve the upper triangular */ 1434 for (i=n-1; i>=0; i--){ 1435 v = aa + 49*(adiag[i+1]+1); 1436 vi = aj + adiag[i+1]+1; 1437 nz = adiag[i] - adiag[i+1] - 1; 1438 idt = 7*i; 1439 s1 = t[idt]; s2 = t[1+idt]; 1440 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1441 s6 = t[5+idt];s7 = t[6+idt]; 1442 for(m=0;m<nz;m++){ 1443 idx = 7*vi[m]; 1444 x1 = t[idx]; x2 = t[1+idx]; 1445 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1446 x6 = t[5+idx]; x7 = t[6+idx]; 1447 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1448 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1449 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1450 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1451 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1452 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1453 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1454 v += 49; 1455 } 1456 idc = 7*c[i]; 1457 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1458 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1459 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1460 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1461 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1462 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1463 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1464 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1465 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1466 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1467 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1468 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1469 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1470 
v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1471 } 1472 1473 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1474 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1475 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1476 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1477 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1478 PetscFunctionReturn(0); 1479 }

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Forward and backward block
   triangular solves with 7x7 blocks (49 values per block) that use no
   index sets: block row i of bb maps directly to block row i of xx.

   Input Parameters:
+  A  - the matrix; its data pointer is used as a Mat_SeqBAIJ
-  bb - the right-hand side

   Output Parameter:
.  xx - the result; it is also used as the working vector during the solve

   Notes:
   The blocks before a->diag[i] in block row i are eliminated in the forward
   sweep and the blocks after it in the backward sweep.  The block stored at
   a->diag[i] is then multiplied into the row (presumably the stored inverse
   of the diagonal block - confirm against the factorization routine).
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
  PetscErrorCode    ierr;
  PetscInt          *diag = a->diag,jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular; block row 0 is a plain copy */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--){
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* multiply the row by the block stored at the diagonal position */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
                         + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
                         + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
                         + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                         + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                         + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                         + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                         + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* 7x7 blocks: use the same 49/7 flop count that the other block-size-7
     solves in this file log (this routine previously logged the 36/6
     constants of the block-size-6 kernels) */
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1578 #undef __FUNCT__ 1579 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 1580 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1581 { 1582 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1583 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 1584 PetscErrorCode ierr; 1585 PetscInt idx,jdx,idt; 1586 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 1587 const MatScalar *aa=a->a,*v; 1588 PetscScalar *x; 1589 const PetscScalar *b; 1590 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1591 1592 PetscFunctionBegin; 1593 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1594 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1595 /* forward solve the lower triangular */ 1596 idx = 0; 1597 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 1598 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 1599 for (i=1; i<n; i++) { 1600 v = aa + bs2*ai[i]; 1601 vi = aj + ai[i]; 1602 nz = ai[i+1] - ai[i]; 1603 idx = bs*i; 1604 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1605 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1606 for(k=0;k<nz;k++) { 1607 jdx = bs*vi[k]; 1608 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 1609 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 1610 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1611 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1612 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1613 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1614 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 +
v[46]*x7; 1615 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1616 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1617 v += bs2; 1618 } 1619 1620 x[idx] = s1; 1621 x[1+idx] = s2; 1622 x[2+idx] = s3; 1623 x[3+idx] = s4; 1624 x[4+idx] = s5; 1625 x[5+idx] = s6; 1626 x[6+idx] = s7; 1627 } 1628 1629 /* backward solve the upper triangular */ 1630 for (i=n-1; i>=0; i--){ 1631 v = aa + bs2*ai[2*n-i]; 1632 vi = aj + ai[2*n-i]; 1633 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1634 idt = bs*i; 1635 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 1636 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 1637 for(k=0;k<nz;k++) { 1638 idx = bs*vi[k]; 1639 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 1640 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 1641 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1642 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1643 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1644 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1645 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1646 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1647 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1648 v += bs2; 1649 } 1650 /* x = inv_diagonal*x */ 1651 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1652 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1653 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1654 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1655 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1656 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + 
v[47]*s7; 1657 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1658 } 1659 1660 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1661 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1662 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1663 PetscFunctionReturn(0); 1664 }

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2 - Forward and backward
   block triangular solves that use no index sets and read seven components
   per block row; the block strides bs and bs2 come from A->rmap->bs and
   a->bs2.

   Input Parameters:
+  A  - the matrix; its data pointer is used as a Mat_SeqBAIJ
-  bb - the right-hand side

   Output Parameter:
.  xx - the result; it is also used as the working vector during the solve

   Notes:
   Lower rows are addressed through ai[i]..ai[i+1].  The off-diagonal upper
   blocks of block row i are the adiag[i]-adiag[i+1]-1 blocks starting at
   position adiag[i+1]+1; the block that follows them is applied last as the
   inverse diagonal.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  const PetscInt    *ai = a->i,*aj = a->j,*adiag = a->diag,*cols;
  PetscInt          row,jj,cnt,nblk = a->mbs;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa = a->a,*v;
  PetscScalar       *x,*xrow;
  const PetscScalar *b,*brow,*xcol;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular; block row 0 is a plain copy */
  x[0] = b[0];
  x[1] = b[1];
  x[2] = b[2];
  x[3] = b[3];
  x[4] = b[4];
  x[5] = b[5];
  x[6] = b[6];
  for (row=1; row<nblk; row++) {
    v    = aa + bs2*ai[row];
    cols = aj + ai[row];
    cnt  = ai[row+1] - ai[row];
    brow = b + bs*row;
    s1 = brow[0]; s2 = brow[1]; s3 = brow[2]; s4 = brow[3];
    s5 = brow[4]; s6 = brow[5]; s7 = brow[6];
    for (jj=0; jj<cnt; jj++) {
      xcol = x + bs*cols[jj];
      x1 = xcol[0]; x2 = xcol[1]; x3 = xcol[2]; x4 = xcol[3];
      x5 = xcol[4]; x6 = xcol[5]; x7 = xcol[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    xrow    = x + bs*row;
    xrow[0] = s1;
    xrow[1] = s2;
    xrow[2] = s3;
    xrow[3] = s4;
    xrow[4] = s5;
    xrow[5] = s6;
    xrow[6] = s7;
  }

  /* backward solve the upper triangular */
  for (row=nblk-1; row>=0; row--) {
    v    = aa + bs2*(adiag[row+1]+1);
    cols = aj + adiag[row+1]+1;
    cnt  = adiag[row] - adiag[row+1]-1;
    xrow = x + bs*row;
    s1 = xrow[0]; s2 = xrow[1]; s3 = xrow[2]; s4 = xrow[3];
    s5 = xrow[4]; s6 = xrow[5]; s7 = xrow[6];
    for (jj=0; jj<cnt; jj++) {
      xcol = x + bs*cols[jj];
      x1 = xcol[0]; x2 = xcol[1]; x3 = xcol[2]; x4 = xcol[3];
      x5 = xcol[4]; x6 = xcol[5]; x7 = xcol[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    /* x = inv_diagonal*x; v was advanced past the off-diagonal blocks */
    xrow[0] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    xrow[1] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    xrow[2] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    xrow[3] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    xrow[4] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    xrow[5] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    xrow[6] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1754 #undef __FUNCT__ 1755 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1756 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1757 { 1758 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1759 IS iscol=a->col,isrow=a->row; 1760 PetscErrorCode ierr; 1761 const PetscInt *r,*c,*rout,*cout; 1762 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1763 const MatScalar *aa=a->a,*v; 1764 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1765 const PetscScalar *b; 1766 PetscFunctionBegin; 1767 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1768 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1769 t = a->solve_work; 1770 1771 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1772 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1773 1774 /* forward solve the lower triangular */ 1775 idx = 6*(*r++); 1776 t[0] = b[idx]; t[1] = b[1+idx]; 1777 t[2] = b[2+idx]; t[3] = b[3+idx]; 1778 t[4] = b[4+idx]; t[5] = b[5+idx]; 1779 for (i=1; i<n; i++) { 1780 v = aa + 36*ai[i]; 1781 vi = aj + ai[i]; 1782 nz = diag[i] - ai[i]; 1783 idx = 6*(*r++); 1784 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1785 s5 = b[4+idx]; s6 = b[5+idx]; 1786 while (nz--) { 1787 idx = 6*(*vi++); 1788 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1789 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1790 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1791 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1792 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1793 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1794 s5 -= v[4]*x1 + v[10]*x2 +
v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1795 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1796 v += 36; 1797 } 1798 idx = 6*i; 1799 t[idx] = s1;t[1+idx] = s2; 1800 t[2+idx] = s3;t[3+idx] = s4; 1801 t[4+idx] = s5;t[5+idx] = s6; 1802 } 1803 /* backward solve the upper triangular */ 1804 for (i=n-1; i>=0; i--){ 1805 v = aa + 36*diag[i] + 36; 1806 vi = aj + diag[i] + 1; 1807 nz = ai[i+1] - diag[i] - 1; 1808 idt = 6*i; 1809 s1 = t[idt]; s2 = t[1+idt]; 1810 s3 = t[2+idt];s4 = t[3+idt]; 1811 s5 = t[4+idt];s6 = t[5+idt]; 1812 while (nz--) { 1813 idx = 6*(*vi++); 1814 x1 = t[idx]; x2 = t[1+idx]; 1815 x3 = t[2+idx]; x4 = t[3+idx]; 1816 x5 = t[4+idx]; x6 = t[5+idx]; 1817 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1818 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1819 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1820 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1821 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1822 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1823 v += 36; 1824 } 1825 idc = 6*(*c--); 1826 v = aa + 36*diag[i]; 1827 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1828 v[18]*s4+v[24]*s5+v[30]*s6; 1829 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1830 v[19]*s4+v[25]*s5+v[31]*s6; 1831 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1832 v[20]*s4+v[26]*s5+v[32]*s6; 1833 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1834 v[21]*s4+v[27]*s5+v[33]*s6; 1835 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1836 v[22]*s4+v[28]*s5+v[34]*s6; 1837 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1838 v[23]*s4+v[29]*s5+v[35]*s6; 1839 } 1840 1841 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1842 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1843 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1844 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1845 ierr = 
PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1846 PetscFunctionReturn(0); 1847 } 1848 1849 #undef __FUNCT__ 1850 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 1851 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1852 { 1853 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1854 IS iscol=a->col,isrow=a->row; 1855 PetscErrorCode ierr; 1856 const PetscInt *r,*c,*rout,*cout; 1857 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 1858 const MatScalar *aa=a->a,*v; 1859 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1860 const PetscScalar *b; 1861 PetscFunctionBegin; 1862 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1863 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1864 t = a->solve_work; 1865 1866 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1867 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1868 1869 /* forward solve the lower triangular */ 1870 idx = 6*r[0]; 1871 t[0] = b[idx]; t[1] = b[1+idx]; 1872 t[2] = b[2+idx]; t[3] = b[3+idx]; 1873 t[4] = b[4+idx]; t[5] = b[5+idx]; 1874 for (i=1; i<n; i++) { 1875 v = aa + 36*ai[i]; 1876 vi = aj + ai[i]; 1877 nz = ai[i+1] - ai[i]; 1878 idx = 6*r[i]; 1879 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1880 s5 = b[4+idx]; s6 = b[5+idx]; 1881 for(m=0;m<nz;m++){ 1882 idx = 6*vi[m]; 1883 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1884 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1885 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1886 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1887 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1888 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1889 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1890 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1891 v += 36; 1892 } 1893 idx = 6*i; 1894 t[idx] = s1;t[1+idx] = s2; 1895 t[2+idx] = s3;t[3+idx] = s4; 1896 t[4+idx] = 
s5;t[5+idx] = s6; 1897 } 1898 /* backward solve the upper triangular */ 1899 for (i=n-1; i>=0; i--){ 1900 k = 2*n-i; 1901 v = aa + 36*ai[k]; 1902 vi = aj + ai[k]; 1903 nz = ai[k+1] - ai[k] - 1; 1904 idt = 6*i; 1905 s1 = t[idt]; s2 = t[1+idt]; 1906 s3 = t[2+idt];s4 = t[3+idt]; 1907 s5 = t[4+idt];s6 = t[5+idt]; 1908 for(m=0;m<nz;m++){ 1909 idx = 6*vi[m]; 1910 x1 = t[idx]; x2 = t[1+idx]; 1911 x3 = t[2+idx]; x4 = t[3+idx]; 1912 x5 = t[4+idx]; x6 = t[5+idx]; 1913 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1914 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1915 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1916 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1917 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1918 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1919 v += 36; 1920 } 1921 idc = 6*c[i]; 1922 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1923 v[18]*s4+v[24]*s5+v[30]*s6; 1924 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1925 v[19]*s4+v[25]*s5+v[31]*s6; 1926 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1927 v[20]*s4+v[26]*s5+v[32]*s6; 1928 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1929 v[21]*s4+v[27]*s5+v[33]*s6; 1930 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1931 v[22]*s4+v[28]*s5+v[34]*s6; 1932 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1933 v[23]*s4+v[29]*s5+v[35]*s6; 1934 } 1935 1936 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1937 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1938 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1939 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1940 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1941 PetscFunctionReturn(0); 1942 } 1943 1944 #undef __FUNCT__ 1945 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2" 1946 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1947 { 1948 Mat_SeqBAIJ 
*a=(Mat_SeqBAIJ *)A->data; 1949 IS iscol=a->col,isrow=a->row; 1950 PetscErrorCode ierr; 1951 const PetscInt *r,*c,*rout,*cout; 1952 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 1953 const MatScalar *aa=a->a,*v; 1954 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1955 const PetscScalar *b; 1956 PetscFunctionBegin; 1957 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1958 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1959 t = a->solve_work; 1960 1961 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1962 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1963 1964 /* forward solve the lower triangular */ 1965 idx = 6*r[0]; 1966 t[0] = b[idx]; t[1] = b[1+idx]; 1967 t[2] = b[2+idx]; t[3] = b[3+idx]; 1968 t[4] = b[4+idx]; t[5] = b[5+idx]; 1969 for (i=1; i<n; i++) { 1970 v = aa + 36*ai[i]; 1971 vi = aj + ai[i]; 1972 nz = ai[i+1] - ai[i]; 1973 idx = 6*r[i]; 1974 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1975 s5 = b[4+idx]; s6 = b[5+idx]; 1976 for(m=0;m<nz;m++){ 1977 idx = 6*vi[m]; 1978 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1979 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1980 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1981 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1982 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1983 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1984 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1985 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1986 v += 36; 1987 } 1988 idx = 6*i; 1989 t[idx] = s1;t[1+idx] = s2; 1990 t[2+idx] = s3;t[3+idx] = s4; 1991 t[4+idx] = s5;t[5+idx] = s6; 1992 } 1993 /* backward solve the upper triangular */ 1994 for (i=n-1; i>=0; i--){ 1995 v = aa + 36*(adiag[i+1]+1); 1996 vi = aj + adiag[i+1]+1; 1997 nz = adiag[i] - adiag[i+1] - 1; 1998 idt = 6*i; 1999 s1 = t[idt]; s2 = t[1+idt]; 2000 s3 = t[2+idt];s4 = 
t[3+idt]; 2001 s5 = t[4+idt];s6 = t[5+idt]; 2002 for(m=0;m<nz;m++){ 2003 idx = 6*vi[m]; 2004 x1 = t[idx]; x2 = t[1+idx]; 2005 x3 = t[2+idx]; x4 = t[3+idx]; 2006 x5 = t[4+idx]; x6 = t[5+idx]; 2007 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2008 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2009 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2010 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2011 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2012 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2013 v += 36; 2014 } 2015 idc = 6*c[i]; 2016 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2017 v[18]*s4+v[24]*s5+v[30]*s6; 2018 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2019 v[19]*s4+v[25]*s5+v[31]*s6; 2020 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2021 v[20]*s4+v[26]*s5+v[32]*s6; 2022 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2023 v[21]*s4+v[27]*s5+v[33]*s6; 2024 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2025 v[22]*s4+v[28]*s5+v[34]*s6; 2026 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2027 v[23]*s4+v[29]*s5+v[35]*s6; 2028 } 2029 2030 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2031 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2032 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2033 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2034 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2035 PetscFunctionReturn(0); 2036 } 2037 2038 #undef __FUNCT__ 2039 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2040 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 2041 { 2042 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2043 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2044 PetscErrorCode ierr; 2045 PetscInt *diag = a->diag,jdx; 2046 const MatScalar *aa=a->a,*v; 2047 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2048 const PetscScalar *b; 
2049 2050 PetscFunctionBegin; 2051 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2052 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2053 /* forward solve the lower triangular */ 2054 idx = 0; 2055 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2056 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2057 for (i=1; i<n; i++) { 2058 v = aa + 36*ai[i]; 2059 vi = aj + ai[i]; 2060 nz = diag[i] - ai[i]; 2061 idx = 6*i; 2062 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2063 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2064 while (nz--) { 2065 jdx = 6*(*vi++); 2066 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2067 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2068 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2069 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2070 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2071 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2072 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2073 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2074 v += 36; 2075 } 2076 x[idx] = s1; 2077 x[1+idx] = s2; 2078 x[2+idx] = s3; 2079 x[3+idx] = s4; 2080 x[4+idx] = s5; 2081 x[5+idx] = s6; 2082 } 2083 /* backward solve the upper triangular */ 2084 for (i=n-1; i>=0; i--){ 2085 v = aa + 36*diag[i] + 36; 2086 vi = aj + diag[i] + 1; 2087 nz = ai[i+1] - diag[i] - 1; 2088 idt = 6*i; 2089 s1 = x[idt]; s2 = x[1+idt]; 2090 s3 = x[2+idt]; s4 = x[3+idt]; 2091 s5 = x[4+idt]; s6 = x[5+idt]; 2092 while (nz--) { 2093 idx = 6*(*vi++); 2094 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2095 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2096 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2097 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2098 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2099 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2100 s5 -= v[4]*x1 + 
v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2101 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2102 v += 36; 2103 } 2104 v = aa + 36*diag[i]; 2105 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2106 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2107 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2108 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2109 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2110 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2111 } 2112 2113 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2114 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2115 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2116 PetscFunctionReturn(0); 2117 } 2118 2119 #undef __FUNCT__ 2120 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2121 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2122 { 2123 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2124 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 2125 PetscErrorCode ierr; 2126 PetscInt idx,jdx,idt; 2127 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2128 const MatScalar *aa=a->a,*v; 2129 PetscScalar *x; 2130 const PetscScalar *b; 2131 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2132 2133 PetscFunctionBegin; 2134 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2135 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2136 /* forward solve the lower triangular */ 2137 idx = 0; 2138 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2139 x[4] = b[4+idx];x[5] = b[5+idx]; 2140 for (i=1; i<n; i++) { 2141 v = aa + bs2*ai[i]; 2142 vi = aj + ai[i]; 2143 nz = ai[i+1] - ai[i]; 2144 idx = bs*i; 2145 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2146 s5 = b[4+idx];s6 = b[5+idx]; 2147 for(k=0;k<nz;k++){ 2148 jdx = bs*vi[k]; 2149 x1 = x[jdx];x2 
= x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2150 x5 = x[4+jdx]; x6 = x[5+jdx]; 2151 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2152 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2153 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2154 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2155 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2156 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2157 v += bs2; 2158 } 2159 2160 x[idx] = s1; 2161 x[1+idx] = s2; 2162 x[2+idx] = s3; 2163 x[3+idx] = s4; 2164 x[4+idx] = s5; 2165 x[5+idx] = s6; 2166 } 2167 2168 /* backward solve the upper triangular */ 2169 for (i=n-1; i>=0; i--){ 2170 v = aa + bs2*ai[2*n-i]; 2171 vi = aj + ai[2*n-i]; 2172 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2173 idt = bs*i; 2174 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2175 s5 = x[4+idt];s6 = x[5+idt]; 2176 for(k=0;k<nz;k++){ 2177 idx = bs*vi[k]; 2178 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2179 x5 = x[4+idx];x6 = x[5+idx]; 2180 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2181 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2182 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2183 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2184 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2185 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2186 v += bs2; 2187 } 2188 /* x = inv_diagonal*x */ 2189 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2190 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2191 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2192 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2193 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2194 
x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2195 } 2196 2197 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2198 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2199 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2200 PetscFunctionReturn(0); 2201 } 2202 2203 #undef __FUNCT__ 2204 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2" 2205 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2206 { 2207 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2208 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2209 PetscErrorCode ierr; 2210 PetscInt idx,jdx,idt; 2211 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2212 const MatScalar *aa=a->a,*v; 2213 PetscScalar *x; 2214 const PetscScalar *b; 2215 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2216 2217 PetscFunctionBegin; 2218 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2219 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2220 /* forward solve the lower triangular */ 2221 idx = 0; 2222 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2223 x[4] = b[4+idx];x[5] = b[5+idx]; 2224 for (i=1; i<n; i++) { 2225 v = aa + bs2*ai[i]; 2226 vi = aj + ai[i]; 2227 nz = ai[i+1] - ai[i]; 2228 idx = bs*i; 2229 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2230 s5 = b[4+idx];s6 = b[5+idx]; 2231 for(k=0;k<nz;k++){ 2232 jdx = bs*vi[k]; 2233 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2234 x5 = x[4+jdx]; x6 = x[5+jdx]; 2235 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2236 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2237 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2238 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2239 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2240 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2241 v += bs2; 2242 } 2243 2244 
x[idx] = s1; 2245 x[1+idx] = s2; 2246 x[2+idx] = s3; 2247 x[3+idx] = s4; 2248 x[4+idx] = s5; 2249 x[5+idx] = s6; 2250 } 2251 2252 /* backward solve the upper triangular */ 2253 for (i=n-1; i>=0; i--){ 2254 v = aa + bs2*(adiag[i+1]+1); 2255 vi = aj + adiag[i+1]+1; 2256 nz = adiag[i] - adiag[i+1]-1; 2257 idt = bs*i; 2258 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2259 s5 = x[4+idt];s6 = x[5+idt]; 2260 for(k=0;k<nz;k++){ 2261 idx = bs*vi[k]; 2262 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2263 x5 = x[4+idx];x6 = x[5+idx]; 2264 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2265 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2266 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2267 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2268 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2269 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2270 v += bs2; 2271 } 2272 /* x = inv_diagonal*x */ 2273 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2274 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2275 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2276 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2277 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2278 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2279 } 2280 2281 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2282 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2283 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2284 PetscFunctionReturn(0); 2285 } 2286 2287 #undef __FUNCT__ 2288 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2289 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 2290 { 2291 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2292 IS iscol=a->col,isrow=a->row; 2293 PetscErrorCode 
ierr; 2294 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 2295 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2296 const MatScalar *aa=a->a,*v; 2297 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2298 const PetscScalar *b; 2299 2300 PetscFunctionBegin; 2301 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2302 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2303 t = a->solve_work; 2304 2305 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2306 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2307 2308 /* forward solve the lower triangular */ 2309 idx = 5*(*r++); 2310 t[0] = b[idx]; t[1] = b[1+idx]; 2311 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2312 for (i=1; i<n; i++) { 2313 v = aa + 25*ai[i]; 2314 vi = aj + ai[i]; 2315 nz = diag[i] - ai[i]; 2316 idx = 5*(*r++); 2317 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2318 s5 = b[4+idx]; 2319 while (nz--) { 2320 idx = 5*(*vi++); 2321 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2322 x4 = t[3+idx];x5 = t[4+idx]; 2323 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2324 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2325 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2326 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2327 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2328 v += 25; 2329 } 2330 idx = 5*i; 2331 t[idx] = s1;t[1+idx] = s2; 2332 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2333 } 2334 /* backward solve the upper triangular */ 2335 for (i=n-1; i>=0; i--){ 2336 v = aa + 25*diag[i] + 25; 2337 vi = aj + diag[i] + 1; 2338 nz = ai[i+1] - diag[i] - 1; 2339 idt = 5*i; 2340 s1 = t[idt]; s2 = t[1+idt]; 2341 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2342 while (nz--) { 2343 idx = 5*(*vi++); 2344 x1 = t[idx]; x2 = t[1+idx]; 2345 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2346 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2347 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2348 s3 -= 
v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2349 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2350 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2351 v += 25; 2352 } 2353 idc = 5*(*c--); 2354 v = aa + 25*diag[i]; 2355 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2356 v[15]*s4+v[20]*s5; 2357 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2358 v[16]*s4+v[21]*s5; 2359 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2360 v[17]*s4+v[22]*s5; 2361 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2362 v[18]*s4+v[23]*s5; 2363 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2364 v[19]*s4+v[24]*s5; 2365 } 2366 2367 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2368 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2369 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2370 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2371 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2372 PetscFunctionReturn(0); 2373 } 2374 2375 #undef __FUNCT__ 2376 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2377 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2378 { 2379 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2380 IS iscol=a->col,isrow=a->row; 2381 PetscErrorCode ierr; 2382 const PetscInt *r,*c,*rout,*cout; 2383 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2384 const MatScalar *aa=a->a,*v; 2385 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2386 const PetscScalar *b; 2387 2388 PetscFunctionBegin; 2389 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2390 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2391 t = a->solve_work; 2392 2393 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2394 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2395 2396 /* forward solve the lower triangular */ 2397 idx = 5*r[0]; 2398 t[0] = b[idx]; t[1] = b[1+idx]; 2399 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2400 for (i=1; i<n; i++) { 2401 v = aa + 25*ai[i]; 2402 vi = aj + ai[i]; 
2403 nz = ai[i+1] - ai[i]; 2404 idx = 5*r[i]; 2405 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2406 s5 = b[4+idx]; 2407 for(m=0;m<nz;m++){ 2408 idx = 5*vi[m]; 2409 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2410 x4 = t[3+idx];x5 = t[4+idx]; 2411 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2412 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2413 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2414 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2415 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2416 v += 25; 2417 } 2418 idx = 5*i; 2419 t[idx] = s1;t[1+idx] = s2; 2420 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2421 } 2422 /* backward solve the upper triangular */ 2423 for (i=n-1; i>=0; i--){ 2424 k = 2*n-i; 2425 v = aa + 25*ai[k]; 2426 vi = aj + ai[k]; 2427 nz = ai[k+1] - ai[k] - 1; 2428 idt = 5*i; 2429 s1 = t[idt]; s2 = t[1+idt]; 2430 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2431 for(m=0;m<nz;m++){ 2432 idx = 5*vi[m]; 2433 x1 = t[idx]; x2 = t[1+idx]; 2434 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2435 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2436 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2437 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2438 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2439 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2440 v += 25; 2441 } 2442 idc = 5*c[i]; 2443 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2444 v[15]*s4+v[20]*s5; 2445 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2446 v[16]*s4+v[21]*s5; 2447 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2448 v[17]*s4+v[22]*s5; 2449 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2450 v[18]*s4+v[23]*s5; 2451 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2452 v[19]*s4+v[24]*s5; 2453 } 2454 2455 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2456 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2457 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2459 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2460 PetscFunctionReturn(0); 2461 } 2462 2463 #undef __FUNCT__ 2464 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2" 2465 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2466 { 2467 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2468 IS iscol=a->col,isrow=a->row; 2469 PetscErrorCode ierr; 2470 const PetscInt *r,*c,*rout,*cout; 2471 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2472 const MatScalar *aa=a->a,*v; 2473 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2474 const PetscScalar *b; 2475 2476 PetscFunctionBegin; 2477 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2478 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2479 t = a->solve_work; 2480 2481 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2482 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2483 2484 /* forward solve the lower triangular */ 2485 idx = 5*r[0]; 2486 t[0] = b[idx]; t[1] = b[1+idx]; 2487 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2488 for (i=1; i<n; i++) { 2489 v = aa + 25*ai[i]; 2490 vi = aj + ai[i]; 2491 nz = ai[i+1] - ai[i]; 2492 idx = 5*r[i]; 2493 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2494 s5 = b[4+idx]; 2495 for(m=0;m<nz;m++){ 2496 idx = 5*vi[m]; 2497 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2498 x4 = t[3+idx];x5 = t[4+idx]; 2499 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2500 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2501 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2502 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2503 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2504 v += 25; 2505 } 2506 idx = 5*i; 2507 t[idx] = s1;t[1+idx] = s2; 2508 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2509 } 2510 /* backward solve the upper triangular */ 2511 for 
(i=n-1; i>=0; i--){ 2512 v = aa + 25*(adiag[i+1]+1); 2513 vi = aj + adiag[i+1]+1; 2514 nz = adiag[i] - adiag[i+1] - 1; 2515 idt = 5*i; 2516 s1 = t[idt]; s2 = t[1+idt]; 2517 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2518 for(m=0;m<nz;m++){ 2519 idx = 5*vi[m]; 2520 x1 = t[idx]; x2 = t[1+idx]; 2521 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2522 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2523 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2524 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2525 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2526 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2527 v += 25; 2528 } 2529 idc = 5*c[i]; 2530 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2531 v[15]*s4+v[20]*s5; 2532 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2533 v[16]*s4+v[21]*s5; 2534 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2535 v[17]*s4+v[22]*s5; 2536 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2537 v[18]*s4+v[23]*s5; 2538 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2539 v[19]*s4+v[24]*s5; 2540 } 2541 2542 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2543 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2544 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2545 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2546 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2547 PetscFunctionReturn(0); 2548 } 2549 2550 #undef __FUNCT__ 2551 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2552 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2553 { 2554 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2555 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2556 PetscErrorCode ierr; 2557 PetscInt *diag = a->diag,jdx; 2558 const MatScalar *aa=a->a,*v; 2559 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2560 const PetscScalar *b; 2561 2562 PetscFunctionBegin; 2563 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2564 ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 2565 /* forward solve the lower triangular */ 2566 idx = 0; 2567 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2568 for (i=1; i<n; i++) { 2569 v = aa + 25*ai[i]; 2570 vi = aj + ai[i]; 2571 nz = diag[i] - ai[i]; 2572 idx = 5*i; 2573 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2574 while (nz--) { 2575 jdx = 5*(*vi++); 2576 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2582 v += 25; 2583 } 2584 x[idx] = s1; 2585 x[1+idx] = s2; 2586 x[2+idx] = s3; 2587 x[3+idx] = s4; 2588 x[4+idx] = s5; 2589 } 2590 /* backward solve the upper triangular */ 2591 for (i=n-1; i>=0; i--){ 2592 v = aa + 25*diag[i] + 25; 2593 vi = aj + diag[i] + 1; 2594 nz = ai[i+1] - diag[i] - 1; 2595 idt = 5*i; 2596 s1 = x[idt]; s2 = x[1+idt]; 2597 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2598 while (nz--) { 2599 idx = 5*(*vi++); 2600 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2601 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2602 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2603 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2604 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2605 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2606 v += 25; 2607 } 2608 v = aa + 25*diag[i]; 2609 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2610 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2611 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2612 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2613 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + 
v[19]*s4 + v[24]*s5; 2614 } 2615 2616 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2617 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2618 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2619 PetscFunctionReturn(0); 2620 } 2621 2622 #undef __FUNCT__ 2623 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2624 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2625 { 2626 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2627 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2628 PetscErrorCode ierr; 2629 PetscInt jdx; 2630 const MatScalar *aa=a->a,*v; 2631 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2632 const PetscScalar *b; 2633 2634 PetscFunctionBegin; 2635 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2636 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2637 /* forward solve the lower triangular */ 2638 idx = 0; 2639 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2640 for (i=1; i<n; i++) { 2641 v = aa + 25*ai[i]; 2642 vi = aj + ai[i]; 2643 nz = ai[i+1] - ai[i]; 2644 idx = 5*i; 2645 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2646 for(k=0;k<nz;k++) { 2647 jdx = 5*vi[k]; 2648 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2649 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2650 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2651 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2652 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2653 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2654 v += 25; 2655 } 2656 x[idx] = s1; 2657 x[1+idx] = s2; 2658 x[2+idx] = s3; 2659 x[3+idx] = s4; 2660 x[4+idx] = s5; 2661 } 2662 2663 /* backward solve the upper triangular */ 2664 for (i=n-1; i>=0; i--){ 2665 v = aa + 25*ai[2*n-i]; 2666 vi = aj + ai[2*n-i]; 2667 nz = ai[2*n-i +1] - ai[2*n-i]-1; 2668 idt = 5*i; 2669 s1 = x[idt]; s2 = x[1+idt]; 2670 s3 = x[2+idt];s4 = 
x[3+idt]; s5 = x[4+idt]; 2671 for(k=0;k<nz;k++){ 2672 idx = 5*vi[k]; 2673 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2674 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2675 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2676 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2677 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2678 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2679 v += 25; 2680 } 2681 /* x = inv_diagonal*x */ 2682 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2683 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2684 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2685 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2686 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2687 } 2688 2689 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2690 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2691 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2692 PetscFunctionReturn(0); 2693 } 2694 2695 #undef __FUNCT__ 2696 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2" 2697 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2698 { 2699 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2700 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 2701 PetscErrorCode ierr; 2702 PetscInt jdx; 2703 const MatScalar *aa=a->a,*v; 2704 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2705 const PetscScalar *b; 2706 2707 PetscFunctionBegin; 2708 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2709 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2710 /* forward solve the lower triangular */ 2711 idx = 0; 2712 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2713 for (i=1; i<n; i++) { 2714 v = aa + 25*ai[i]; 2715 vi = aj + ai[i]; 2716 nz = ai[i+1] - ai[i]; 2717 idx = 5*i; 2718 s1 = b[idx];s2 = 
b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2719 for(k=0;k<nz;k++) { 2720 jdx = 5*vi[k]; 2721 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2722 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2723 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2724 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2725 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2726 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2727 v += 25; 2728 } 2729 x[idx] = s1; 2730 x[1+idx] = s2; 2731 x[2+idx] = s3; 2732 x[3+idx] = s4; 2733 x[4+idx] = s5; 2734 } 2735 2736 /* backward solve the upper triangular */ 2737 for (i=n-1; i>=0; i--){ 2738 v = aa + 25*(adiag[i+1]+1); 2739 vi = aj + adiag[i+1]+1; 2740 nz = adiag[i] - adiag[i+1]-1; 2741 idt = 5*i; 2742 s1 = x[idt]; s2 = x[1+idt]; 2743 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2744 for(k=0;k<nz;k++){ 2745 idx = 5*vi[k]; 2746 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2747 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2748 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2749 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2750 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2751 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2752 v += 25; 2753 } 2754 /* x = inv_diagonal*x */ 2755 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2756 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2757 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2758 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2759 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2760 } 2761 2762 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2763 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2764 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2765 PetscFunctionReturn(0); 2766 } 2767 2768 #undef __FUNCT__ 2769 #define __FUNCT__ 
"MatSolve_SeqBAIJ_4" 2770 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2771 { 2772 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2773 IS iscol=a->col,isrow=a->row; 2774 PetscErrorCode ierr; 2775 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2776 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2777 const MatScalar *aa=a->a,*v; 2778 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2779 const PetscScalar *b; 2780 2781 PetscFunctionBegin; 2782 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2783 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2784 t = a->solve_work; 2785 2786 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2787 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2788 2789 /* forward solve the lower triangular */ 2790 idx = 4*(*r++); 2791 t[0] = b[idx]; t[1] = b[1+idx]; 2792 t[2] = b[2+idx]; t[3] = b[3+idx]; 2793 for (i=1; i<n; i++) { 2794 v = aa + 16*ai[i]; 2795 vi = aj + ai[i]; 2796 nz = diag[i] - ai[i]; 2797 idx = 4*(*r++); 2798 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2799 while (nz--) { 2800 idx = 4*(*vi++); 2801 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2802 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2803 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2804 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2805 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2806 v += 16; 2807 } 2808 idx = 4*i; 2809 t[idx] = s1;t[1+idx] = s2; 2810 t[2+idx] = s3;t[3+idx] = s4; 2811 } 2812 /* backward solve the upper triangular */ 2813 for (i=n-1; i>=0; i--){ 2814 v = aa + 16*diag[i] + 16; 2815 vi = aj + diag[i] + 1; 2816 nz = ai[i+1] - diag[i] - 1; 2817 idt = 4*i; 2818 s1 = t[idt]; s2 = t[1+idt]; 2819 s3 = t[2+idt];s4 = t[3+idt]; 2820 while (nz--) { 2821 idx = 4*(*vi++); 2822 x1 = t[idx]; x2 = t[1+idx]; 2823 x3 = t[2+idx]; x4 = t[3+idx]; 2824 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2825 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2826 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2827 s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2828 v += 16; 2829 } 2830 idc = 4*(*c--); 2831 v = aa + 16*diag[i]; 2832 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2833 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2834 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2835 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2836 } 2837 2838 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2839 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2840 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2841 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2842 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2843 PetscFunctionReturn(0); 2844 } 2845 2846 #undef __FUNCT__ 2847 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2848 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2849 { 2850 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2851 IS iscol=a->col,isrow=a->row; 2852 PetscErrorCode ierr; 2853 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 2854 const PetscInt *r,*c,*rout,*cout; 2855 const MatScalar *aa=a->a,*v; 2856 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2857 const PetscScalar *b; 2858 2859 PetscFunctionBegin; 2860 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2861 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2862 t = a->solve_work; 2863 2864 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2865 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2866 2867 /* forward solve the lower triangular */ 2868 idx = 4*r[0]; 2869 t[0] = b[idx]; t[1] = b[1+idx]; 2870 t[2] = b[2+idx]; t[3] = b[3+idx]; 2871 for (i=1; i<n; i++) { 2872 v = aa + 16*ai[i]; 2873 vi = aj + ai[i]; 2874 nz = ai[i+1] - ai[i]; 2875 idx = 4*r[i]; 2876 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2877 for(m=0;m<nz;m++){ 2878 idx = 4*vi[m]; 2879 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2880 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2881 s2 -= v[1]*x1 + v[5]*x2 + 
v[9]*x3 + v[13]*x4; 2882 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2883 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2884 v += 16; 2885 } 2886 idx = 4*i; 2887 t[idx] = s1;t[1+idx] = s2; 2888 t[2+idx] = s3;t[3+idx] = s4; 2889 } 2890 /* backward solve the upper triangular */ 2891 for (i=n-1; i>=0; i--){ 2892 k = 2*n-i; 2893 v = aa + 16*ai[k]; 2894 vi = aj + ai[k]; 2895 nz = ai[k+1] - ai[k] - 1; 2896 idt = 4*i; 2897 s1 = t[idt]; s2 = t[1+idt]; 2898 s3 = t[2+idt];s4 = t[3+idt]; 2899 for(m=0;m<nz;m++){ 2900 idx = 4*vi[m]; 2901 x1 = t[idx]; x2 = t[1+idx]; 2902 x3 = t[2+idx]; x4 = t[3+idx]; 2903 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2904 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2905 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2906 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2907 v += 16; 2908 } 2909 idc = 4*c[i]; 2910 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2911 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2912 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2913 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2914 } 2915 2916 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2917 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2918 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2919 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2920 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2921 PetscFunctionReturn(0); 2922 } 2923 2924 #undef __FUNCT__ 2925 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2" 2926 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx) 2927 { 2928 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2929 IS iscol=a->col,isrow=a->row; 2930 PetscErrorCode ierr; 2931 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2932 const PetscInt *r,*c,*rout,*cout; 2933 const MatScalar *aa=a->a,*v; 2934 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2935 const PetscScalar *b; 2936 2937 PetscFunctionBegin; 2938 ierr = 
VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2939 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2940 t = a->solve_work; 2941 2942 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2943 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2944 2945 /* forward solve the lower triangular */ 2946 idx = 4*r[0]; 2947 t[0] = b[idx]; t[1] = b[1+idx]; 2948 t[2] = b[2+idx]; t[3] = b[3+idx]; 2949 for (i=1; i<n; i++) { 2950 v = aa + 16*ai[i]; 2951 vi = aj + ai[i]; 2952 nz = ai[i+1] - ai[i]; 2953 idx = 4*r[i]; 2954 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2955 for(m=0;m<nz;m++){ 2956 idx = 4*vi[m]; 2957 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2958 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2959 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2960 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2961 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2962 v += 16; 2963 } 2964 idx = 4*i; 2965 t[idx] = s1;t[1+idx] = s2; 2966 t[2+idx] = s3;t[3+idx] = s4; 2967 } 2968 /* backward solve the upper triangular */ 2969 for (i=n-1; i>=0; i--){ 2970 v = aa + 16*(adiag[i+1]+1); 2971 vi = aj + adiag[i+1]+1; 2972 nz = adiag[i] - adiag[i+1] - 1; 2973 idt = 4*i; 2974 s1 = t[idt]; s2 = t[1+idt]; 2975 s3 = t[2+idt];s4 = t[3+idt]; 2976 for(m=0;m<nz;m++){ 2977 idx = 4*vi[m]; 2978 x1 = t[idx]; x2 = t[1+idx]; 2979 x3 = t[2+idx]; x4 = t[3+idx]; 2980 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2981 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2982 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2983 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2984 v += 16; 2985 } 2986 idc = 4*c[i]; 2987 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2988 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2989 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2990 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2991 } 2992 2993 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2994 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2995 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2996 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2997 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2998 PetscFunctionReturn(0); 2999 } 3000 3001 #undef __FUNCT__ 3002 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3003 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3004 { 3005 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3006 IS iscol=a->col,isrow=a->row; 3007 PetscErrorCode ierr; 3008 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3009 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3010 const MatScalar *aa=a->a,*v; 3011 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3012 PetscScalar *x; 3013 const PetscScalar *b; 3014 3015 PetscFunctionBegin; 3016 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3017 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3018 t = (MatScalar *)a->solve_work; 3019 3020 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3021 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3022 3023 /* forward solve the lower triangular */ 3024 idx = 4*(*r++); 3025 t[0] = (MatScalar)b[idx]; 3026 t[1] = (MatScalar)b[1+idx]; 3027 t[2] = (MatScalar)b[2+idx]; 3028 t[3] = (MatScalar)b[3+idx]; 3029 for (i=1; i<n; i++) { 3030 v = aa + 16*ai[i]; 3031 vi = aj + ai[i]; 3032 nz = diag[i] - ai[i]; 3033 idx = 4*(*r++); 3034 s1 = (MatScalar)b[idx]; 3035 s2 = (MatScalar)b[1+idx]; 3036 s3 = (MatScalar)b[2+idx]; 3037 s4 = (MatScalar)b[3+idx]; 3038 while (nz--) { 3039 idx = 4*(*vi++); 3040 x1 = t[idx]; 3041 x2 = t[1+idx]; 3042 x3 = t[2+idx]; 3043 x4 = t[3+idx]; 3044 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3045 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3046 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3047 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3048 v += 16; 3049 } 3050 idx = 4*i; 3051 t[idx] = s1; 3052 t[1+idx] = s2; 3053 t[2+idx] = s3; 3054 t[3+idx] = s4; 3055 } 3056 /* backward solve the upper triangular */ 3057 for (i=n-1; i>=0; 
i--){ 3058 v = aa + 16*diag[i] + 16; 3059 vi = aj + diag[i] + 1; 3060 nz = ai[i+1] - diag[i] - 1; 3061 idt = 4*i; 3062 s1 = t[idt]; 3063 s2 = t[1+idt]; 3064 s3 = t[2+idt]; 3065 s4 = t[3+idt]; 3066 while (nz--) { 3067 idx = 4*(*vi++); 3068 x1 = t[idx]; 3069 x2 = t[1+idx]; 3070 x3 = t[2+idx]; 3071 x4 = t[3+idx]; 3072 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3073 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3074 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3075 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3076 v += 16; 3077 } 3078 idc = 4*(*c--); 3079 v = aa + 16*diag[i]; 3080 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3081 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3082 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3083 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3084 x[idc] = (PetscScalar)t[idt]; 3085 x[1+idc] = (PetscScalar)t[1+idt]; 3086 x[2+idc] = (PetscScalar)t[2+idt]; 3087 x[3+idc] = (PetscScalar)t[3+idt]; 3088 } 3089 3090 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3091 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3092 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3093 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3094 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3095 PetscFunctionReturn(0); 3096 } 3097 3098 #if defined (PETSC_HAVE_SSE) 3099 3100 #include PETSC_HAVE_SSE 3101 3102 #undef __FUNCT__ 3103 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3104 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3105 { 3106 /* 3107 Note: This code uses demotion of double 3108 to float when performing the mixed-mode computation. 3109 This may not be numerically reasonable for all applications. 
3110 */ 3111 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3112 IS iscol=a->col,isrow=a->row; 3113 PetscErrorCode ierr; 3114 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3115 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3116 MatScalar *aa=a->a,*v; 3117 PetscScalar *x,*b,*t; 3118 3119 /* Make space in temp stack for 16 Byte Aligned arrays */ 3120 float ssealignedspace[11],*tmps,*tmpx; 3121 unsigned long offset; 3122 3123 PetscFunctionBegin; 3124 SSE_SCOPE_BEGIN; 3125 3126 offset = (unsigned long)ssealignedspace % 16; 3127 if (offset) offset = (16 - offset)/4; 3128 tmps = &ssealignedspace[offset]; 3129 tmpx = &ssealignedspace[offset+4]; 3130 PREFETCH_NTA(aa+16*ai[1]); 3131 3132 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3133 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3134 t = a->solve_work; 3135 3136 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3137 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3138 3139 /* forward solve the lower triangular */ 3140 idx = 4*(*r++); 3141 t[0] = b[idx]; t[1] = b[1+idx]; 3142 t[2] = b[2+idx]; t[3] = b[3+idx]; 3143 v = aa + 16*ai[1]; 3144 3145 for (i=1; i<n;) { 3146 PREFETCH_NTA(&v[8]); 3147 vi = aj + ai[i]; 3148 nz = diag[i] - ai[i]; 3149 idx = 4*(*r++); 3150 3151 /* Demote sum from double to float */ 3152 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3153 LOAD_PS(tmps,XMM7); 3154 3155 while (nz--) { 3156 PREFETCH_NTA(&v[16]); 3157 idx = 4*(*vi++); 3158 3159 /* Demote solution (so far) from double to float */ 3160 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3161 3162 /* 4x4 Matrix-Vector product with negative accumulation: */ 3163 SSE_INLINE_BEGIN_2(tmpx,v) 3164 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3165 3166 /* First Column */ 3167 SSE_COPY_PS(XMM0,XMM6) 3168 SSE_SHUFFLE(XMM0,XMM0,0x00) 3169 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3170 SSE_SUB_PS(XMM7,XMM0) 3171 3172 /* Second Column */ 3173 SSE_COPY_PS(XMM1,XMM6) 3174 SSE_SHUFFLE(XMM1,XMM1,0x55) 3175 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3176 
SSE_SUB_PS(XMM7,XMM1) 3177 3178 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3179 3180 /* Third Column */ 3181 SSE_COPY_PS(XMM2,XMM6) 3182 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3183 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3184 SSE_SUB_PS(XMM7,XMM2) 3185 3186 /* Fourth Column */ 3187 SSE_COPY_PS(XMM3,XMM6) 3188 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3189 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3190 SSE_SUB_PS(XMM7,XMM3) 3191 SSE_INLINE_END_2 3192 3193 v += 16; 3194 } 3195 idx = 4*i; 3196 v = aa + 16*ai[++i]; 3197 PREFETCH_NTA(v); 3198 STORE_PS(tmps,XMM7); 3199 3200 /* Promote result from float to double */ 3201 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3202 } 3203 /* backward solve the upper triangular */ 3204 idt = 4*(n-1); 3205 ai16 = 16*diag[n-1]; 3206 v = aa + ai16 + 16; 3207 for (i=n-1; i>=0;){ 3208 PREFETCH_NTA(&v[8]); 3209 vi = aj + diag[i] + 1; 3210 nz = ai[i+1] - diag[i] - 1; 3211 3212 /* Demote accumulator from double to float */ 3213 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3214 LOAD_PS(tmps,XMM7); 3215 3216 while (nz--) { 3217 PREFETCH_NTA(&v[16]); 3218 idx = 4*(*vi++); 3219 3220 /* Demote solution (so far) from double to float */ 3221 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3222 3223 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3224 SSE_INLINE_BEGIN_2(tmpx,v) 3225 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3226 3227 /* First Column */ 3228 SSE_COPY_PS(XMM0,XMM6) 3229 SSE_SHUFFLE(XMM0,XMM0,0x00) 3230 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3231 SSE_SUB_PS(XMM7,XMM0) 3232 3233 /* Second Column */ 3234 SSE_COPY_PS(XMM1,XMM6) 3235 SSE_SHUFFLE(XMM1,XMM1,0x55) 3236 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3237 SSE_SUB_PS(XMM7,XMM1) 3238 3239 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3240 3241 /* Third Column */ 3242 SSE_COPY_PS(XMM2,XMM6) 3243 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3244 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3245 SSE_SUB_PS(XMM7,XMM2) 3246 3247 /* Fourth Column */ 3248 SSE_COPY_PS(XMM3,XMM6) 3249 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3250 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3251 SSE_SUB_PS(XMM7,XMM3) 
3252 SSE_INLINE_END_2 3253 v += 16; 3254 } 3255 v = aa + ai16; 3256 ai16 = 16*diag[--i]; 3257 PREFETCH_NTA(aa+ai16+16); 3258 /* 3259 Scale the result by the diagonal 4x4 block, 3260 which was inverted as part of the factorization 3261 */ 3262 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3263 /* First Column */ 3264 SSE_COPY_PS(XMM0,XMM7) 3265 SSE_SHUFFLE(XMM0,XMM0,0x00) 3266 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3267 3268 /* Second Column */ 3269 SSE_COPY_PS(XMM1,XMM7) 3270 SSE_SHUFFLE(XMM1,XMM1,0x55) 3271 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3272 SSE_ADD_PS(XMM0,XMM1) 3273 3274 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3275 3276 /* Third Column */ 3277 SSE_COPY_PS(XMM2,XMM7) 3278 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3279 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3280 SSE_ADD_PS(XMM0,XMM2) 3281 3282 /* Fourth Column */ 3283 SSE_COPY_PS(XMM3,XMM7) 3284 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3285 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3286 SSE_ADD_PS(XMM0,XMM3) 3287 3288 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3289 SSE_INLINE_END_3 3290 3291 /* Promote solution from float to double */ 3292 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3293 3294 /* Apply reordering to t and stream into x. */ 3295 /* This way, x doesn't pollute the cache. */ 3296 /* Be careful with size: 2 doubles = 4 floats! 
*/ 3297 idc = 4*(*c--); 3298 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3299 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3300 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3301 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3302 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3303 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3304 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3305 SSE_INLINE_END_2 3306 v = aa + ai16 + 16; 3307 idt -= 4; 3308 } 3309 3310 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3311 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3312 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3313 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3314 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3315 SSE_SCOPE_END; 3316 PetscFunctionReturn(0); 3317 } 3318 3319 #endif 3320 3321 3322 /* 3323 Special case where the matrix was ILU(0) factored in the natural 3324 ordering. This eliminates the need for the column and row permutation. 3325 */ 3326 #undef __FUNCT__ 3327 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3328 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3329 { 3330 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3331 PetscInt n=a->mbs; 3332 const PetscInt *ai=a->i,*aj=a->j; 3333 PetscErrorCode ierr; 3334 const PetscInt *diag = a->diag; 3335 const MatScalar *aa=a->a; 3336 PetscScalar *x; 3337 const PetscScalar *b; 3338 3339 PetscFunctionBegin; 3340 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3341 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3342 3343 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3344 { 3345 static PetscScalar w[2000]; /* very BAD need to fix */ 3346 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3347 } 3348 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3349 { 3350 static PetscScalar w[2000]; /* very BAD need to fix */ 3351 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3352 } 3353 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3354 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3355 
#else 3356 { 3357 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3358 const MatScalar *v; 3359 PetscInt jdx,idt,idx,nz,i,ai16; 3360 const PetscInt *vi; 3361 3362 /* forward solve the lower triangular */ 3363 idx = 0; 3364 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3365 for (i=1; i<n; i++) { 3366 v = aa + 16*ai[i]; 3367 vi = aj + ai[i]; 3368 nz = diag[i] - ai[i]; 3369 idx += 4; 3370 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3371 while (nz--) { 3372 jdx = 4*(*vi++); 3373 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3374 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3375 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3376 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3377 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3378 v += 16; 3379 } 3380 x[idx] = s1; 3381 x[1+idx] = s2; 3382 x[2+idx] = s3; 3383 x[3+idx] = s4; 3384 } 3385 /* backward solve the upper triangular */ 3386 idt = 4*(n-1); 3387 for (i=n-1; i>=0; i--){ 3388 ai16 = 16*diag[i]; 3389 v = aa + ai16 + 16; 3390 vi = aj + diag[i] + 1; 3391 nz = ai[i+1] - diag[i] - 1; 3392 s1 = x[idt]; s2 = x[1+idt]; 3393 s3 = x[2+idt];s4 = x[3+idt]; 3394 while (nz--) { 3395 idx = 4*(*vi++); 3396 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3397 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3398 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3399 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3400 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3401 v += 16; 3402 } 3403 v = aa + ai16; 3404 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3405 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3406 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3407 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3408 idt -= 4; 3409 } 3410 } 3411 #endif 3412 3413 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3414 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3415 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3416 PetscFunctionReturn(0); 3417 } 3418 3419 #undef __FUNCT__ 
3420 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3421 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3422 { 3423 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3424 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3425 PetscErrorCode ierr; 3426 PetscInt idx,jdx,idt; 3427 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3428 const MatScalar *aa=a->a,*v; 3429 PetscScalar *x; 3430 const PetscScalar *b; 3431 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3432 3433 PetscFunctionBegin; 3434 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3435 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3436 /* forward solve the lower triangular */ 3437 idx = 0; 3438 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3439 for (i=1; i<n; i++) { 3440 v = aa + bs2*ai[i]; 3441 vi = aj + ai[i]; 3442 nz = ai[i+1] - ai[i]; 3443 idx = bs*i; 3444 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3445 for(k=0;k<nz;k++) { 3446 jdx = bs*vi[k]; 3447 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3448 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3449 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3450 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3451 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3452 3453 v += bs2; 3454 } 3455 3456 x[idx] = s1; 3457 x[1+idx] = s2; 3458 x[2+idx] = s3; 3459 x[3+idx] = s4; 3460 } 3461 3462 /* backward solve the upper triangular */ 3463 for (i=n-1; i>=0; i--){ 3464 v = aa + bs2*ai[2*n-i]; 3465 vi = aj + ai[2*n-i]; 3466 nz = ai[2*n-i +1] - ai[2*n-i]-1; 3467 idt = bs*i; 3468 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3469 3470 for(k=0;k<nz;k++){ 3471 idx = bs*vi[k]; 3472 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3473 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3474 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3475 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3476 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3477 3478 v += bs2; 3479 } 3480 /* x = inv_diagonal*x */ 3481 
x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3482 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3483 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3484 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3485 3486 } 3487 3488 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3489 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3490 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3491 PetscFunctionReturn(0); 3492 } 3493 3494 #undef __FUNCT__ 3495 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2" 3496 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 3497 { 3498 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3499 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3500 PetscErrorCode ierr; 3501 PetscInt idx,jdx,idt; 3502 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3503 const MatScalar *aa=a->a,*v; 3504 PetscScalar *x; 3505 const PetscScalar *b; 3506 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3507 3508 PetscFunctionBegin; 3509 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3510 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3511 /* forward solve the lower triangular */ 3512 idx = 0; 3513 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3514 for (i=1; i<n; i++) { 3515 v = aa + bs2*ai[i]; 3516 vi = aj + ai[i]; 3517 nz = ai[i+1] - ai[i]; 3518 idx = bs*i; 3519 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3520 for(k=0;k<nz;k++) { 3521 jdx = bs*vi[k]; 3522 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3523 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3524 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3525 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3526 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3527 3528 v += bs2; 3529 } 3530 3531 x[idx] = s1; 3532 x[1+idx] = s2; 3533 x[2+idx] = s3; 3534 x[3+idx] = s4; 3535 } 3536 3537 /* backward solve the upper triangular */ 3538 for (i=n-1; i>=0; i--){ 3539 v = aa + bs2*(adiag[i+1]+1); 3540 vi = 
aj + adiag[i+1]+1; 3541 nz = adiag[i] - adiag[i+1]-1; 3542 idt = bs*i; 3543 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3544 3545 for(k=0;k<nz;k++){ 3546 idx = bs*vi[k]; 3547 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3548 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3549 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3550 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3551 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3552 3553 v += bs2; 3554 } 3555 /* x = inv_diagonal*x */ 3556 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3557 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3558 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3559 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3560 3561 } 3562 3563 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3564 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3565 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3566 PetscFunctionReturn(0); 3567 } 3568 3569 #undef __FUNCT__ 3570 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3571 /* MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - 4x4 block triangular solve without row/column permutation that accumulates in MatScalar. Input: A (matrix holding the factors), bb (right-hand side). Output: xx (solution). Returns 0 on success, otherwise the error code propagated by CHKERRQ. The forward sweep stores its result through t, a MatScalar view of the storage of x; the backward sweep takes each row's starting value from t, reads the rows that are already finished from x as PetscScalar, and writes the finished row to x as PetscScalar. NOTE(review): this in-place scheme presumably relies on sizeof(MatScalar) <= sizeof(PetscScalar), so that writing x[idt..idt+3] never overwrites t entries of rows still to be processed - confirm against the build configuration. */ PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3572 { 3573 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3574 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3575 PetscErrorCode ierr; 3576 PetscInt *diag = a->diag; 3577 MatScalar *aa=a->a; 3578 PetscScalar *x,*b; 3579 3580 PetscFunctionBegin; 3581 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3583 3584 { 3585 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3586 MatScalar *v,*t=(MatScalar *)x; /* t aliases the memory of x */ 3587 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3588 3589 /* forward solve the lower triangular */ /* block row 0 is copied (demoted) straight from b */ 3590 idx = 0; 3591 t[0] = (MatScalar)b[0]; 3592 t[1] = (MatScalar)b[1]; 3593 t[2] = (MatScalar)b[2]; 3594 t[3] = (MatScalar)b[3]; 3595 for (i=1; i<n; i++) { 3596 v = aa + 16*ai[i]; 3597 vi = aj + ai[i]; 3598 nz = diag[i] - ai[i]; /* number of blocks left of the diagonal in row i */ 3599 idx += 4; 3600 s1 = (MatScalar)b[idx]; 3601 s2 = (MatScalar)b[1+idx]; 3602 s3 = (MatScalar)b[2+idx]; 3603 
s4 = (MatScalar)b[3+idx]; 3604 while (nz--) { 3605 jdx = 4*(*vi++); 3606 x1 = t[jdx]; 3607 x2 = t[1+jdx]; 3608 x3 = t[2+jdx]; 3609 x4 = t[3+jdx]; 3610 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3611 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3612 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3613 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3614 v += 16; 3615 } 3616 t[idx] = s1; 3617 t[1+idx] = s2; 3618 t[2+idx] = s3; 3619 t[3+idx] = s4; 3620 } 3621 /* backward solve the upper triangular */ 3622 idt = 4*(n-1); 3623 for (i=n-1; i>=0; i--){ 3624 ai16 = 16*diag[i]; 3625 v = aa + ai16 + 16; 3626 vi = aj + diag[i] + 1; 3627 nz = ai[i+1] - diag[i] - 1; /* number of blocks right of the diagonal in row i */ 3628 s1 = t[idt]; 3629 s2 = t[1+idt]; 3630 s3 = t[2+idt]; 3631 s4 = t[3+idt]; 3632 while (nz--) { /* operands come from x, not t: these rows were already written to x in this sweep */ 3633 idx = 4*(*vi++); 3634 x1 = (MatScalar)x[idx]; 3635 x2 = (MatScalar)x[1+idx]; 3636 x3 = (MatScalar)x[2+idx]; 3637 x4 = (MatScalar)x[3+idx]; 3638 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3639 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3640 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3641 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3642 v += 16; 3643 } 3644 v = aa + ai16; /* block at diag[i], applied to the accumulated row before it is stored */ 3645 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3646 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3647 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3648 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3649 idt -= 4; 3650 } 3651 } 3652 3653 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3654 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3655 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3656 PetscFunctionReturn(0); 3657 } 3658 3659 #if defined (PETSC_HAVE_SSE) 3660 3661 #include PETSC_HAVE_SSE 3662 #undef __FUNCT__ 3663 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3664 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 3665 { 3666 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 
3667 unsigned short *aj=(unsigned short *)a->j; 3668 PetscErrorCode ierr; 3669 int *ai=a->i,n=a->mbs,*diag = a->diag; 3670 MatScalar *aa=a->a; 3671 PetscScalar *x,*b; 3672 3673 PetscFunctionBegin; 3674 SSE_SCOPE_BEGIN; 3675 /* 3676 Note: This code currently uses demotion of double 3677 to float when performing the mixed-mode computation. 3678 This may not be numerically reasonable for all applications. 3679 */ 3680 PREFETCH_NTA(aa+16*ai[1]); 3681 3682 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3683 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3684 { 3685 /* x will first be computed in single precision then promoted inplace to double */ 3686 MatScalar *v,*t=(MatScalar *)x; 3687 int nz,i,idt,ai16; 3688 unsigned int jdx,idx; 3689 unsigned short *vi; 3690 /* Forward solve the lower triangular factor. */ 3691 3692 /* First block is the identity. */ 3693 idx = 0; 3694 CONVERT_DOUBLE4_FLOAT4(t,b); 3695 v = aa + 16*((unsigned int)ai[1]); 3696 3697 for (i=1; i<n;) { 3698 PREFETCH_NTA(&v[8]); 3699 vi = aj + ai[i]; 3700 nz = diag[i] - ai[i]; 3701 idx += 4; 3702 3703 /* Demote RHS from double to float. 
*/ 3704 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3705 LOAD_PS(&t[idx],XMM7); 3706 3707 while (nz--) { 3708 PREFETCH_NTA(&v[16]); 3709 jdx = 4*((unsigned int)(*vi++)); 3710 3711 /* 4x4 Matrix-Vector product with negative accumulation: */ 3712 SSE_INLINE_BEGIN_2(&t[jdx],v) 3713 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3714 3715 /* First Column */ 3716 SSE_COPY_PS(XMM0,XMM6) 3717 SSE_SHUFFLE(XMM0,XMM0,0x00) 3718 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3719 SSE_SUB_PS(XMM7,XMM0) 3720 3721 /* Second Column */ 3722 SSE_COPY_PS(XMM1,XMM6) 3723 SSE_SHUFFLE(XMM1,XMM1,0x55) 3724 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3725 SSE_SUB_PS(XMM7,XMM1) 3726 3727 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3728 3729 /* Third Column */ 3730 SSE_COPY_PS(XMM2,XMM6) 3731 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3732 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3733 SSE_SUB_PS(XMM7,XMM2) 3734 3735 /* Fourth Column */ 3736 SSE_COPY_PS(XMM3,XMM6) 3737 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3738 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3739 SSE_SUB_PS(XMM7,XMM3) 3740 SSE_INLINE_END_2 3741 3742 v += 16; 3743 } 3744 v = aa + 16*ai[++i]; 3745 PREFETCH_NTA(v); 3746 STORE_PS(&t[idx],XMM7); 3747 } 3748 3749 /* Backward solve the upper triangular factor.*/ 3750 3751 idt = 4*(n-1); 3752 ai16 = 16*diag[n-1]; 3753 v = aa + ai16 + 16; 3754 for (i=n-1; i>=0;){ 3755 PREFETCH_NTA(&v[8]); 3756 vi = aj + diag[i] + 1; 3757 nz = ai[i+1] - diag[i] - 1; 3758 3759 LOAD_PS(&t[idt],XMM7); 3760 3761 while (nz--) { 3762 PREFETCH_NTA(&v[16]); 3763 idx = 4*((unsigned int)(*vi++)); 3764 3765 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3766 SSE_INLINE_BEGIN_2(&t[idx],v) 3767 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3768 3769 /* First Column */ 3770 SSE_COPY_PS(XMM0,XMM6) 3771 SSE_SHUFFLE(XMM0,XMM0,0x00) 3772 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3773 SSE_SUB_PS(XMM7,XMM0) 3774 3775 /* Second Column */ 3776 SSE_COPY_PS(XMM1,XMM6) 3777 SSE_SHUFFLE(XMM1,XMM1,0x55) 3778 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3779 SSE_SUB_PS(XMM7,XMM1) 3780 3781 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3782 3783 /* Third Column */ 3784 SSE_COPY_PS(XMM2,XMM6) 3785 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3786 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3787 SSE_SUB_PS(XMM7,XMM2) 3788 3789 /* Fourth Column */ 3790 SSE_COPY_PS(XMM3,XMM6) 3791 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3792 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3793 SSE_SUB_PS(XMM7,XMM3) 3794 SSE_INLINE_END_2 3795 v += 16; 3796 } 3797 v = aa + ai16; 3798 ai16 = 16*diag[--i]; 3799 PREFETCH_NTA(aa+ai16+16); 3800 /* 3801 Scale the result by the diagonal 4x4 block, 3802 which was inverted as part of the factorization 3803 */ 3804 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3805 /* First Column */ 3806 SSE_COPY_PS(XMM0,XMM7) 3807 SSE_SHUFFLE(XMM0,XMM0,0x00) 3808 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3809 3810 /* Second Column */ 3811 SSE_COPY_PS(XMM1,XMM7) 3812 SSE_SHUFFLE(XMM1,XMM1,0x55) 3813 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3814 SSE_ADD_PS(XMM0,XMM1) 3815 3816 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3817 3818 /* Third Column */ 3819 SSE_COPY_PS(XMM2,XMM7) 3820 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3821 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3822 SSE_ADD_PS(XMM0,XMM2) 3823 3824 /* Fourth Column */ 3825 SSE_COPY_PS(XMM3,XMM7) 3826 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3827 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3828 SSE_ADD_PS(XMM0,XMM3) 3829 3830 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3831 SSE_INLINE_END_3 3832 3833 v = aa + ai16 + 16; 3834 idt -= 4; 3835 } 3836 3837 /* Convert t from single precision back to double precision (inplace)*/ 3838 idt = 4*(n-1); 3839 for (i=n-1;i>=0;i--) { 3840 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3841 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3842 PetscScalar *xtemp=&x[idt]; 3843 MatScalar *ttemp=&t[idt]; 3844 xtemp[3] = (PetscScalar)ttemp[3]; 3845 xtemp[2] = (PetscScalar)ttemp[2]; 3846 xtemp[1] = (PetscScalar)ttemp[1]; 3847 xtemp[0] = (PetscScalar)ttemp[0]; 3848 idt -= 4; 3849 } 3850 3851 } /* End of artificial scope. 
*/ 3852 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3853 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3854 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3855 SSE_SCOPE_END; 3856 PetscFunctionReturn(0); 3857 } 3858 3859 #undef __FUNCT__ 3860 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3861 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3862 { 3863 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3864 int *aj=a->j; 3865 PetscErrorCode ierr; 3866 int *ai=a->i,n=a->mbs,*diag = a->diag; 3867 MatScalar *aa=a->a; 3868 PetscScalar *x,*b; 3869 3870 PetscFunctionBegin; 3871 SSE_SCOPE_BEGIN; 3872 /* 3873 Note: This code currently uses demotion of double 3874 to float when performing the mixed-mode computation. 3875 This may not be numerically reasonable for all applications. 3876 */ 3877 PREFETCH_NTA(aa+16*ai[1]); 3878 3879 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3880 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3881 { 3882 /* x will first be computed in single precision then promoted inplace to double */ 3883 MatScalar *v,*t=(MatScalar *)x; 3884 int nz,i,idt,ai16; 3885 int jdx,idx; 3886 int *vi; 3887 /* Forward solve the lower triangular factor. */ 3888 3889 /* First block is the identity. */ 3890 idx = 0; 3891 CONVERT_DOUBLE4_FLOAT4(t,b); 3892 v = aa + 16*ai[1]; 3893 3894 for (i=1; i<n;) { 3895 PREFETCH_NTA(&v[8]); 3896 vi = aj + ai[i]; 3897 nz = diag[i] - ai[i]; 3898 idx += 4; 3899 3900 /* Demote RHS from double to float. 
*/ 3901 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3902 LOAD_PS(&t[idx],XMM7); 3903 3904 while (nz--) { 3905 PREFETCH_NTA(&v[16]); 3906 jdx = 4*(*vi++); 3907 /* jdx = *vi++; */ 3908 3909 /* 4x4 Matrix-Vector product with negative accumulation: */ 3910 SSE_INLINE_BEGIN_2(&t[jdx],v) 3911 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3912 3913 /* First Column */ 3914 SSE_COPY_PS(XMM0,XMM6) 3915 SSE_SHUFFLE(XMM0,XMM0,0x00) 3916 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3917 SSE_SUB_PS(XMM7,XMM0) 3918 3919 /* Second Column */ 3920 SSE_COPY_PS(XMM1,XMM6) 3921 SSE_SHUFFLE(XMM1,XMM1,0x55) 3922 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3923 SSE_SUB_PS(XMM7,XMM1) 3924 3925 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3926 3927 /* Third Column */ 3928 SSE_COPY_PS(XMM2,XMM6) 3929 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3930 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3931 SSE_SUB_PS(XMM7,XMM2) 3932 3933 /* Fourth Column */ 3934 SSE_COPY_PS(XMM3,XMM6) 3935 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3936 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3937 SSE_SUB_PS(XMM7,XMM3) 3938 SSE_INLINE_END_2 3939 3940 v += 16; 3941 } 3942 v = aa + 16*ai[++i]; 3943 PREFETCH_NTA(v); 3944 STORE_PS(&t[idx],XMM7); 3945 } 3946 3947 /* Backward solve the upper triangular factor.*/ 3948 3949 idt = 4*(n-1); 3950 ai16 = 16*diag[n-1]; 3951 v = aa + ai16 + 16; 3952 for (i=n-1; i>=0;){ 3953 PREFETCH_NTA(&v[8]); 3954 vi = aj + diag[i] + 1; 3955 nz = ai[i+1] - diag[i] - 1; 3956 3957 LOAD_PS(&t[idt],XMM7); 3958 3959 while (nz--) { 3960 PREFETCH_NTA(&v[16]); 3961 idx = 4*(*vi++); 3962 /* idx = *vi++; */ 3963 3964 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3965 SSE_INLINE_BEGIN_2(&t[idx],v) 3966 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3967 3968 /* First Column */ 3969 SSE_COPY_PS(XMM0,XMM6) 3970 SSE_SHUFFLE(XMM0,XMM0,0x00) 3971 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3972 SSE_SUB_PS(XMM7,XMM0) 3973 3974 /* Second Column */ 3975 SSE_COPY_PS(XMM1,XMM6) 3976 SSE_SHUFFLE(XMM1,XMM1,0x55) 3977 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3978 SSE_SUB_PS(XMM7,XMM1) 3979 3980 
SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3981 3982 /* Third Column */ 3983 SSE_COPY_PS(XMM2,XMM6) 3984 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3985 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3986 SSE_SUB_PS(XMM7,XMM2) 3987 3988 /* Fourth Column */ 3989 SSE_COPY_PS(XMM3,XMM6) 3990 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3991 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3992 SSE_SUB_PS(XMM7,XMM3) 3993 SSE_INLINE_END_2 3994 v += 16; 3995 } 3996 v = aa + ai16; 3997 ai16 = 16*diag[--i]; 3998 PREFETCH_NTA(aa+ai16+16); 3999 /* 4000 Scale the result by the diagonal 4x4 block, 4001 which was inverted as part of the factorization 4002 */ 4003 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4004 /* First Column */ 4005 SSE_COPY_PS(XMM0,XMM7) 4006 SSE_SHUFFLE(XMM0,XMM0,0x00) 4007 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4008 4009 /* Second Column */ 4010 SSE_COPY_PS(XMM1,XMM7) 4011 SSE_SHUFFLE(XMM1,XMM1,0x55) 4012 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4013 SSE_ADD_PS(XMM0,XMM1) 4014 4015 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4016 4017 /* Third Column */ 4018 SSE_COPY_PS(XMM2,XMM7) 4019 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4020 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4021 SSE_ADD_PS(XMM0,XMM2) 4022 4023 /* Fourth Column */ 4024 SSE_COPY_PS(XMM3,XMM7) 4025 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4026 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4027 SSE_ADD_PS(XMM0,XMM3) 4028 4029 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4030 SSE_INLINE_END_3 4031 4032 v = aa + ai16 + 16; 4033 idt -= 4; 4034 } 4035 4036 /* Convert t from single precision back to double precision (inplace)*/ 4037 idt = 4*(n-1); 4038 for (i=n-1;i>=0;i--) { 4039 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4040 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4041 PetscScalar *xtemp=&x[idt]; 4042 MatScalar *ttemp=&t[idt]; 4043 xtemp[3] = (PetscScalar)ttemp[3]; 4044 xtemp[2] = (PetscScalar)ttemp[2]; 4045 xtemp[1] = (PetscScalar)ttemp[1]; 4046 xtemp[0] = (PetscScalar)ttemp[0]; 4047 idt -= 4; 4048 } 4049 4050 } /* End of artificial scope. 
*/ 4051 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4052 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4053 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4054 SSE_SCOPE_END; 4055 PetscFunctionReturn(0); 4056 } 4057 4058 #endif 4059 4060 #undef __FUNCT__ 4061 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4062 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4063 { 4064 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4065 IS iscol=a->col,isrow=a->row; 4066 PetscErrorCode ierr; 4067 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4068 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4069 const MatScalar *aa=a->a,*v; 4070 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4071 const PetscScalar *b; 4072 4073 PetscFunctionBegin; 4074 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4075 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4076 t = a->solve_work; 4077 4078 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4079 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4080 4081 /* forward solve the lower triangular */ 4082 idx = 3*(*r++); 4083 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4084 for (i=1; i<n; i++) { 4085 v = aa + 9*ai[i]; 4086 vi = aj + ai[i]; 4087 nz = diag[i] - ai[i]; 4088 idx = 3*(*r++); 4089 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4090 while (nz--) { 4091 idx = 3*(*vi++); 4092 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4093 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4094 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4095 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4096 v += 9; 4097 } 4098 idx = 3*i; 4099 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4100 } 4101 /* backward solve the upper triangular */ 4102 for (i=n-1; i>=0; i--){ 4103 v = aa + 9*diag[i] + 9; 4104 vi = aj + diag[i] + 1; 4105 nz = ai[i+1] - diag[i] - 1; 4106 idt = 3*i; 4107 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4108 while (nz--) { 4109 idx = 3*(*vi++); 4110 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4111 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4112 s2 -= v[1]*x1 + v[4]*x2 + 
v[7]*x3; 4113 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4114 v += 9; 4115 } 4116 idc = 3*(*c--); 4117 v = aa + 9*diag[i]; 4118 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4119 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4120 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4121 } 4122 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4123 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4124 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4125 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4126 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4127 PetscFunctionReturn(0); 4128 } 4129 4130 #undef __FUNCT__ 4131 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4132 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 4133 { 4134 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4135 IS iscol=a->col,isrow=a->row; 4136 PetscErrorCode ierr; 4137 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m; 4138 const PetscInt *r,*c,*rout,*cout; 4139 const MatScalar *aa=a->a,*v; 4140 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4141 const PetscScalar *b; 4142 4143 PetscFunctionBegin; 4144 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4145 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4146 t = a->solve_work; 4147 4148 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4149 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4150 4151 /* forward solve the lower triangular */ 4152 idx = 3*r[0]; 4153 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4154 for (i=1; i<n; i++) { 4155 v = aa + 9*ai[i]; 4156 vi = aj + ai[i]; 4157 nz = ai[i+1] - ai[i]; 4158 idx = 3*r[i]; 4159 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4160 for(m=0;m<nz;m++){ 4161 idx = 3*vi[m]; 4162 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4163 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4164 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4165 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4166 v += 9; 4167 } 4168 idx = 3*i; 4169 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4170 } 4171 /* 
backward solve the upper triangular */ 4172 for (i=n-1; i>=0; i--){ 4173 k = 2*n-i; 4174 v = aa + 9*ai[k]; 4175 vi = aj + ai[k]; 4176 nz = ai[k +1] - ai[k] - 1; 4177 idt = 3*i; 4178 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4179 for(m=0;m<nz;m++){ 4180 idx = 3*vi[m]; 4181 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4182 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4183 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4184 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4185 v += 9; 4186 } 4187 idc = 3*c[i]; 4188 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4189 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4190 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4191 } 4192 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4193 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4194 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4195 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4196 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4197 PetscFunctionReturn(0); 4198 } 4199 4200 #undef __FUNCT__ 4201 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2" 4202 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4203 { 4204 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4205 IS iscol=a->col,isrow=a->row; 4206 PetscErrorCode ierr; 4207 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4208 const PetscInt *r,*c,*rout,*cout; 4209 const MatScalar *aa=a->a,*v; 4210 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4211 const PetscScalar *b; 4212 4213 PetscFunctionBegin; 4214 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4215 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4216 t = a->solve_work; 4217 4218 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4219 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4220 4221 /* forward solve the lower triangular */ 4222 idx = 3*r[0]; 4223 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4224 for (i=1; i<n; i++) { 4225 v = aa + 9*ai[i]; 4226 vi = aj + ai[i]; 4227 nz = ai[i+1] - ai[i]; 
4228 idx = 3*r[i]; 4229 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4230 for(m=0;m<nz;m++){ 4231 idx = 3*vi[m]; 4232 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4233 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4234 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4235 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4236 v += 9; 4237 } 4238 idx = 3*i; 4239 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4240 } 4241 /* backward solve the upper triangular */ 4242 for (i=n-1; i>=0; i--){ 4243 v = aa + 9*(adiag[i+1]+1); 4244 vi = aj + adiag[i+1]+1; 4245 nz = adiag[i] - adiag[i+1] - 1; 4246 idt = 3*i; 4247 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4248 for(m=0;m<nz;m++){ 4249 idx = 3*vi[m]; 4250 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4251 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4252 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4253 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4254 v += 9; 4255 } 4256 idc = 3*c[i]; 4257 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4258 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4259 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4260 } 4261 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4262 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4263 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4264 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4265 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4266 PetscFunctionReturn(0); 4267 } 4268 4269 /* 4270 Special case where the matrix was ILU(0) factored in the natural 4271 ordering. This eliminates the need for the column and row permutation. 
4272 */ 4273 #undef __FUNCT__ 4274 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4275 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4276 { 4277 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4278 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4279 PetscErrorCode ierr; 4280 PetscInt *diag = a->diag; 4281 const MatScalar *aa=a->a,*v; 4282 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4283 const PetscScalar *b; 4284 PetscInt jdx,idt,idx,nz,*vi,i; 4285 4286 PetscFunctionBegin; 4287 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4288 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4289 4290 /* forward solve the lower triangular */ 4291 idx = 0; 4292 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4293 for (i=1; i<n; i++) { 4294 v = aa + 9*ai[i]; 4295 vi = aj + ai[i]; 4296 nz = diag[i] - ai[i]; 4297 idx += 3; 4298 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4299 while (nz--) { 4300 jdx = 3*(*vi++); 4301 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4302 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4303 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4304 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4305 v += 9; 4306 } 4307 x[idx] = s1; 4308 x[1+idx] = s2; 4309 x[2+idx] = s3; 4310 } 4311 /* backward solve the upper triangular */ 4312 for (i=n-1; i>=0; i--){ 4313 v = aa + 9*diag[i] + 9; 4314 vi = aj + diag[i] + 1; 4315 nz = ai[i+1] - diag[i] - 1; 4316 idt = 3*i; 4317 s1 = x[idt]; s2 = x[1+idt]; 4318 s3 = x[2+idt]; 4319 while (nz--) { 4320 idx = 3*(*vi++); 4321 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4322 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4323 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4324 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4325 v += 9; 4326 } 4327 v = aa + 9*diag[i]; 4328 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4329 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4330 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4331 } 4332 4333 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4334 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4335 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4336 
PetscFunctionReturn(0); 4337 } 4338 4339 #undef __FUNCT__ 4340 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4341 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4342 { 4343 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4344 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4345 PetscErrorCode ierr; 4346 PetscInt idx,jdx,idt; 4347 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4348 const MatScalar *aa=a->a,*v; 4349 PetscScalar *x; 4350 const PetscScalar *b; 4351 PetscScalar s1,s2,s3,x1,x2,x3; 4352 4353 PetscFunctionBegin; 4354 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4355 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4356 /* forward solve the lower triangular */ 4357 idx = 0; 4358 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4359 for (i=1; i<n; i++) { 4360 v = aa + bs2*ai[i]; 4361 vi = aj + ai[i]; 4362 nz = ai[i+1] - ai[i]; 4363 idx = bs*i; 4364 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4365 for(k=0;k<nz;k++){ 4366 jdx = bs*vi[k]; 4367 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4368 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4369 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4370 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4371 4372 v += bs2; 4373 } 4374 4375 x[idx] = s1; 4376 x[1+idx] = s2; 4377 x[2+idx] = s3; 4378 } 4379 4380 /* backward solve the upper triangular */ 4381 for (i=n-1; i>=0; i--){ 4382 v = aa + bs2*ai[2*n-i]; 4383 vi = aj + ai[2*n-i]; 4384 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4385 idt = bs*i; 4386 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4387 4388 for(k=0;k<nz;k++){ 4389 idx = bs*vi[k]; 4390 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4391 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4392 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4393 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4394 4395 v += bs2; 4396 } 4397 /* x = inv_diagonal*x */ 4398 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4399 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4400 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4401 4402 } 4403 4404 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4405 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4406 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4407 PetscFunctionReturn(0); 4408 } 4409 4410 #undef __FUNCT__ 4411 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2" 4412 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4413 { 4414 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4415 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4416 PetscErrorCode ierr; 4417 PetscInt idx,jdx,idt; 4418 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4419 const MatScalar *aa=a->a,*v; 4420 PetscScalar *x; 4421 const PetscScalar *b; 4422 PetscScalar s1,s2,s3,x1,x2,x3; 4423 4424 PetscFunctionBegin; 4425 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4426 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4427 /* forward solve the lower triangular */ 4428 idx = 0; 4429 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4430 for (i=1; i<n; i++) { 4431 v = aa + bs2*ai[i]; 4432 vi = aj + ai[i]; 4433 nz = ai[i+1] - ai[i]; 4434 idx = bs*i; 4435 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4436 for(k=0;k<nz;k++){ 4437 jdx = bs*vi[k]; 4438 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4439 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4440 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4441 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4442 4443 v += bs2; 4444 } 4445 4446 x[idx] = s1; 4447 x[1+idx] = s2; 4448 x[2+idx] = s3; 4449 } 4450 4451 /* backward solve the upper triangular */ 4452 for (i=n-1; i>=0; i--){ 4453 v = aa + bs2*(adiag[i+1]+1); 4454 vi = aj + adiag[i+1]+1; 4455 nz = adiag[i] - adiag[i+1]-1; 4456 idt = bs*i; 4457 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4458 4459 for(k=0;k<nz;k++){ 4460 idx = bs*vi[k]; 4461 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4462 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4463 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4464 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4465 4466 v += bs2; 4467 } 4468 /* x = inv_diagonal*x */ 4469 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4470 x[1+idt] = v[1]*s1 + 
v[4]*s2 + v[7]*s3; 4471 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4472 4473 } 4474 4475 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4476 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4477 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4478 PetscFunctionReturn(0); 4479 } 4480 4481 #undef __FUNCT__ 4482 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4483 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4484 { 4485 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4486 IS iscol=a->col,isrow=a->row; 4487 PetscErrorCode ierr; 4488 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4489 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4490 const MatScalar *aa=a->a,*v; 4491 PetscScalar *x,s1,s2,x1,x2,*t; 4492 const PetscScalar *b; 4493 4494 PetscFunctionBegin; 4495 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4496 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4497 t = a->solve_work; 4498 4499 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4500 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4501 4502 /* forward solve the lower triangular */ 4503 idx = 2*(*r++); 4504 t[0] = b[idx]; t[1] = b[1+idx]; 4505 for (i=1; i<n; i++) { 4506 v = aa + 4*ai[i]; 4507 vi = aj + ai[i]; 4508 nz = diag[i] - ai[i]; 4509 idx = 2*(*r++); 4510 s1 = b[idx]; s2 = b[1+idx]; 4511 while (nz--) { 4512 idx = 2*(*vi++); 4513 x1 = t[idx]; x2 = t[1+idx]; 4514 s1 -= v[0]*x1 + v[2]*x2; 4515 s2 -= v[1]*x1 + v[3]*x2; 4516 v += 4; 4517 } 4518 idx = 2*i; 4519 t[idx] = s1; t[1+idx] = s2; 4520 } 4521 /* backward solve the upper triangular */ 4522 for (i=n-1; i>=0; i--){ 4523 v = aa + 4*diag[i] + 4; 4524 vi = aj + diag[i] + 1; 4525 nz = ai[i+1] - diag[i] - 1; 4526 idt = 2*i; 4527 s1 = t[idt]; s2 = t[1+idt]; 4528 while (nz--) { 4529 idx = 2*(*vi++); 4530 x1 = t[idx]; x2 = t[1+idx]; 4531 s1 -= v[0]*x1 + v[2]*x2; 4532 s2 -= v[1]*x1 + v[3]*x2; 4533 v += 4; 4534 } 4535 idc = 2*(*c--); 4536 v = aa + 4*diag[i]; 4537 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4538 
x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4539 } 4540 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4541 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4542 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4543 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4544 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4545 PetscFunctionReturn(0); 4546 } 4547 4548 #undef __FUNCT__ 4549 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4550 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 4551 { 4552 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4553 IS iscol=a->col,isrow=a->row; 4554 PetscErrorCode ierr; 4555 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m; 4556 const PetscInt *r,*c,*rout,*cout; 4557 const MatScalar *aa=a->a,*v; 4558 PetscScalar *x,s1,s2,x1,x2,*t; 4559 const PetscScalar *b; 4560 4561 PetscFunctionBegin; 4562 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4563 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4564 t = a->solve_work; 4565 4566 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4567 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4568 4569 /* forward solve the lower triangular */ 4570 idx = 2*r[0]; 4571 t[0] = b[idx]; t[1] = b[1+idx]; 4572 for (i=1; i<n; i++) { 4573 v = aa + 4*ai[i]; 4574 vi = aj + ai[i]; 4575 nz = ai[i+1] - ai[i]; 4576 idx = 2*r[i]; 4577 s1 = b[idx]; s2 = b[1+idx]; 4578 for(m=0;m<nz;m++){ 4579 jdx = 2*vi[m]; 4580 x1 = t[jdx]; x2 = t[1+jdx]; 4581 s1 -= v[0]*x1 + v[2]*x2; 4582 s2 -= v[1]*x1 + v[3]*x2; 4583 v += 4; 4584 } 4585 idx = 2*i; 4586 t[idx] = s1; t[1+idx] = s2; 4587 } 4588 /* backward solve the upper triangular */ 4589 for (i=n-1; i>=0; i--){ 4590 k = 2*n-i; 4591 v = aa + 4*ai[k]; 4592 vi = aj + ai[k]; 4593 nz = ai[k +1] - ai[k] - 1; 4594 idt = 2*i; 4595 s1 = t[idt]; s2 = t[1+idt]; 4596 for(m=0;m<nz;m++){ 4597 idx = 2*vi[m]; 4598 x1 = t[idx]; x2 = t[1+idx]; 4599 s1 -= v[0]*x1 + v[2]*x2; 4600 s2 -= v[1]*x1 + v[3]*x2; 4601 v += 4; 4602 
} 4603 idc = 2*c[i]; 4604 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4605 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4606 } 4607 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4608 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4609 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4610 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4611 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4612 PetscFunctionReturn(0); 4613 } 4614 4615 #undef __FUNCT__ 4616 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2" 4617 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4618 { 4619 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4620 IS iscol=a->col,isrow=a->row; 4621 PetscErrorCode ierr; 4622 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4623 const PetscInt *r,*c,*rout,*cout; 4624 const MatScalar *aa=a->a,*v; 4625 PetscScalar *x,s1,s2,x1,x2,*t; 4626 const PetscScalar *b; 4627 4628 PetscFunctionBegin; 4629 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4630 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4631 t = a->solve_work; 4632 4633 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4634 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4635 4636 /* forward solve the lower triangular */ 4637 idx = 2*r[0]; 4638 t[0] = b[idx]; t[1] = b[1+idx]; 4639 for (i=1; i<n; i++) { 4640 v = aa + 4*ai[i]; 4641 vi = aj + ai[i]; 4642 nz = ai[i+1] - ai[i]; 4643 idx = 2*r[i]; 4644 s1 = b[idx]; s2 = b[1+idx]; 4645 for(m=0;m<nz;m++){ 4646 jdx = 2*vi[m]; 4647 x1 = t[jdx]; x2 = t[1+jdx]; 4648 s1 -= v[0]*x1 + v[2]*x2; 4649 s2 -= v[1]*x1 + v[3]*x2; 4650 v += 4; 4651 } 4652 idx = 2*i; 4653 t[idx] = s1; t[1+idx] = s2; 4654 } 4655 /* backward solve the upper triangular */ 4656 for (i=n-1; i>=0; i--){ 4657 v = aa + 4*(adiag[i+1]+1); 4658 vi = aj + adiag[i+1]+1; 4659 nz = adiag[i] - adiag[i+1] - 1; 4660 idt = 2*i; 4661 s1 = t[idt]; s2 = t[1+idt]; 4662 for(m=0;m<nz;m++){ 4663 idx = 2*vi[m]; 4664 x1 = t[idx]; 
x2 = t[1+idx]; 4665 s1 -= v[0]*x1 + v[2]*x2; 4666 s2 -= v[1]*x1 + v[3]*x2; 4667 v += 4; 4668 } 4669 idc = 2*c[i]; 4670 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4671 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4672 } 4673 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4674 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4675 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4676 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4677 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4678 PetscFunctionReturn(0); 4679 } 4680 4681 /* 4682 Special case where the matrix was ILU(0) factored in the natural 4683 ordering. This eliminates the need for the column and row permutation. 4684 */ 4685 #undef __FUNCT__ 4686 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4687 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 4688 { 4689 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4690 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4691 PetscErrorCode ierr; 4692 PetscInt *diag = a->diag; 4693 const MatScalar *aa=a->a,*v; 4694 PetscScalar *x,s1,s2,x1,x2; 4695 const PetscScalar *b; 4696 PetscInt jdx,idt,idx,nz,*vi,i; 4697 4698 PetscFunctionBegin; 4699 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4700 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4701 4702 /* forward solve the lower triangular */ 4703 idx = 0; 4704 x[0] = b[0]; x[1] = b[1]; 4705 for (i=1; i<n; i++) { 4706 v = aa + 4*ai[i]; 4707 vi = aj + ai[i]; 4708 nz = diag[i] - ai[i]; 4709 idx += 2; 4710 s1 = b[idx];s2 = b[1+idx]; 4711 while (nz--) { 4712 jdx = 2*(*vi++); 4713 x1 = x[jdx];x2 = x[1+jdx]; 4714 s1 -= v[0]*x1 + v[2]*x2; 4715 s2 -= v[1]*x1 + v[3]*x2; 4716 v += 4; 4717 } 4718 x[idx] = s1; 4719 x[1+idx] = s2; 4720 } 4721 /* backward solve the upper triangular */ 4722 for (i=n-1; i>=0; i--){ 4723 v = aa + 4*diag[i] + 4; 4724 vi = aj + diag[i] + 1; 4725 nz = ai[i+1] - diag[i] - 1; 4726 idt = 2*i; 4727 s1 = x[idt]; s2 = x[1+idt]; 4728 while (nz--) { 4729 idx = 2*(*vi++); 4730 x1 = 
x[idx]; x2 = x[1+idx]; 4731 s1 -= v[0]*x1 + v[2]*x2; 4732 s2 -= v[1]*x1 + v[3]*x2; 4733 v += 4; 4734 } 4735 v = aa + 4*diag[i]; 4736 x[idt] = v[0]*s1 + v[2]*s2; 4737 x[1+idt] = v[1]*s1 + v[3]*s2; 4738 } 4739 4740 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4741 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4742 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4743 PetscFunctionReturn(0); 4744 } 4745 4746 #undef __FUNCT__ 4747 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4748 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4749 { 4750 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4751 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 4752 PetscErrorCode ierr; 4753 PetscInt jdx; 4754 const MatScalar *aa=a->a,*v; 4755 PetscScalar *x,s1,s2,x1,x2; 4756 const PetscScalar *b; 4757 4758 PetscFunctionBegin; 4759 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4760 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4761 /* forward solve the lower triangular */ 4762 idx = 0; 4763 x[0] = b[idx]; x[1] = b[1+idx]; 4764 for (i=1; i<n; i++) { 4765 v = aa + 4*ai[i]; 4766 vi = aj + ai[i]; 4767 nz = ai[i+1] - ai[i]; 4768 idx = 2*i; 4769 s1 = b[idx];s2 = b[1+idx]; 4770 for(k=0;k<nz;k++){ 4771 jdx = 2*vi[k]; 4772 x1 = x[jdx];x2 = x[1+jdx]; 4773 s1 -= v[0]*x1 + v[2]*x2; 4774 s2 -= v[1]*x1 + v[3]*x2; 4775 v += 4; 4776 } 4777 x[idx] = s1; 4778 x[1+idx] = s2; 4779 } 4780 4781 /* backward solve the upper triangular */ 4782 for (i=n-1; i>=0; i--){ 4783 v = aa + 4*ai[2*n-i]; 4784 vi = aj + ai[2*n-i]; 4785 nz = ai[2*n-i +1] - ai[2*n-i]-1; 4786 idt = 2*i; 4787 s1 = x[idt]; s2 = x[1+idt]; 4788 for(k=0;k<nz;k++){ 4789 idx = 2*vi[k]; 4790 x1 = x[idx]; x2 = x[1+idx]; 4791 s1 -= v[0]*x1 + v[2]*x2; 4792 s2 -= v[1]*x1 + v[3]*x2; 4793 v += 4; 4794 } 4795 /* x = inv_diagonal*x */ 4796 x[idt] = v[0]*s1 + v[2]*s2; 4797 x[1+idt] = v[1]*s1 + v[3]*s2; 4798 } 4799 4800 ierr = 
VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4801 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4802 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4803 PetscFunctionReturn(0); 4804 } 4805 4806 #undef __FUNCT__ 4807 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 4808 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 4809 { 4810 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4811 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4812 PetscErrorCode ierr; 4813 PetscInt jdx; 4814 const MatScalar *aa=a->a,*v; 4815 PetscScalar *x,s1,s2,x1,x2; 4816 const PetscScalar *b; 4817 4818 PetscFunctionBegin; 4819 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4820 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4821 /* forward solve the lower triangular */ 4822 idx = 0; 4823 x[0] = b[idx]; x[1] = b[1+idx]; 4824 for (i=1; i<n; i++) { 4825 v = aa + 4*ai[i]; 4826 vi = aj + ai[i]; 4827 nz = ai[i+1] - ai[i]; 4828 idx = 2*i; 4829 s1 = b[idx];s2 = b[1+idx]; 4830 for(k=0;k<nz;k++){ 4831 jdx = 2*vi[k]; 4832 x1 = x[jdx];x2 = x[1+jdx]; 4833 s1 -= v[0]*x1 + v[2]*x2; 4834 s2 -= v[1]*x1 + v[3]*x2; 4835 v += 4; 4836 } 4837 x[idx] = s1; 4838 x[1+idx] = s2; 4839 } 4840 4841 /* backward solve the upper triangular */ 4842 for (i=n-1; i>=0; i--){ 4843 v = aa + 4*(adiag[i+1]+1); 4844 vi = aj + adiag[i+1]+1; 4845 nz = adiag[i] - adiag[i+1]-1; 4846 idt = 2*i; 4847 s1 = x[idt]; s2 = x[1+idt]; 4848 for(k=0;k<nz;k++){ 4849 idx = 2*vi[k]; 4850 x1 = x[idx]; x2 = x[1+idx]; 4851 s1 -= v[0]*x1 + v[2]*x2; 4852 s2 -= v[1]*x1 + v[3]*x2; 4853 v += 4; 4854 } 4855 /* x = inv_diagonal*x */ 4856 x[idt] = v[0]*s1 + v[2]*s2; 4857 x[1+idt] = v[1]*s1 + v[3]*s2; 4858 } 4859 4860 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4861 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4862 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4863 PetscFunctionReturn(0); 4864 } 4865 4866 #undef 
__FUNCT__ 4867 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4868 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4869 { 4870 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4871 IS iscol=a->col,isrow=a->row; 4872 PetscErrorCode ierr; 4873 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4874 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4875 MatScalar *aa=a->a,*v; 4876 PetscScalar *x,*b,s1,*t; 4877 4878 PetscFunctionBegin; 4879 if (!n) PetscFunctionReturn(0); 4880 4881 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4882 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4883 t = a->solve_work; 4884 4885 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4886 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4887 4888 /* forward solve the lower triangular */ 4889 t[0] = b[*r++]; 4890 for (i=1; i<n; i++) { 4891 v = aa + ai[i]; 4892 vi = aj + ai[i]; 4893 nz = diag[i] - ai[i]; 4894 s1 = b[*r++]; 4895 while (nz--) { 4896 s1 -= (*v++)*t[*vi++]; 4897 } 4898 t[i] = s1; 4899 } 4900 /* backward solve the upper triangular */ 4901 for (i=n-1; i>=0; i--){ 4902 v = aa + diag[i] + 1; 4903 vi = aj + diag[i] + 1; 4904 nz = ai[i+1] - diag[i] - 1; 4905 s1 = t[i]; 4906 while (nz--) { 4907 s1 -= (*v++)*t[*vi++]; 4908 } 4909 x[*c--] = t[i] = aa[diag[i]]*s1; 4910 } 4911 4912 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4913 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4914 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4915 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4916 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4917 PetscFunctionReturn(0); 4918 } 4919 /* 4920 Special case where the matrix was ILU(0) factored in the natural 4921 ordering. This eliminates the need for the column and row permutation. 
4922 */ 4923 #undef __FUNCT__ 4924 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4925 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 4926 { 4927 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4928 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4929 PetscErrorCode ierr; 4930 PetscInt *diag = a->diag; 4931 MatScalar *aa=a->a; 4932 PetscScalar *x,*b; 4933 PetscScalar s1,x1; 4934 MatScalar *v; 4935 PetscInt jdx,idt,idx,nz,*vi,i; 4936 4937 PetscFunctionBegin; 4938 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4939 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4940 4941 /* forward solve the lower triangular */ 4942 idx = 0; 4943 x[0] = b[0]; 4944 for (i=1; i<n; i++) { 4945 v = aa + ai[i]; 4946 vi = aj + ai[i]; 4947 nz = diag[i] - ai[i]; 4948 idx += 1; 4949 s1 = b[idx]; 4950 while (nz--) { 4951 jdx = *vi++; 4952 x1 = x[jdx]; 4953 s1 -= v[0]*x1; 4954 v += 1; 4955 } 4956 x[idx] = s1; 4957 } 4958 /* backward solve the upper triangular */ 4959 for (i=n-1; i>=0; i--){ 4960 v = aa + diag[i] + 1; 4961 vi = aj + diag[i] + 1; 4962 nz = ai[i+1] - diag[i] - 1; 4963 idt = i; 4964 s1 = x[idt]; 4965 while (nz--) { 4966 idx = *vi++; 4967 x1 = x[idx]; 4968 s1 -= v[0]*x1; 4969 v += 1; 4970 } 4971 v = aa + diag[i]; 4972 x[idt] = v[0]*s1; 4973 } 4974 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4975 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4976 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4977 PetscFunctionReturn(0); 4978 } 4979 4980 /* ----------------------------------------------------------------*/ 4981 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 4982 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4983 4984 #undef __FUNCT__ 4985 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 4986 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 4987 { 4988 Mat C=B; 4989 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4990 IS 
isrow = b->row,isicol = b->icol; 4991 PetscErrorCode ierr; 4992 const PetscInt *r,*ic,*ics; 4993 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4994 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4995 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4996 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4997 MatScalar *v_work; 4998 4999 PetscFunctionBegin; 5000 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5001 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5002 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5003 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 5004 ics = ic; 5005 5006 /* generate work space needed by dense LU factorization */ 5007 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 5008 mwork = v_work + bs; 5009 v_pivots = (PetscInt*)(mwork + bs2); 5010 5011 for (i=0; i<n; i++){ 5012 /* zero rtmp */ 5013 /* L part */ 5014 nz = bi[i+1] - bi[i]; 5015 bjtmp = bj + bi[i]; 5016 for (j=0; j<nz; j++){ 5017 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5018 } 5019 5020 /* U part */ 5021 nz = bdiag[i] - bdiag[i+1]; 5022 bjtmp = bj + bdiag[i+1]+1; 5023 for (j=0; j<nz; j++){ 5024 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5025 } 5026 5027 /* load in initial (unfactored row) */ 5028 nz = ai[r[i]+1] - ai[r[i]]; 5029 ajtmp = aj + ai[r[i]]; 5030 v = aa + bs2*ai[r[i]]; 5031 for (j=0; j<nz; j++) { 5032 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5033 } 5034 5035 /* elimination */ 5036 bjtmp = bj + bi[i]; 5037 nzL = bi[i+1] - bi[i]; 5038 for(k=0;k < nzL;k++) { 5039 row = bjtmp[k]; 5040 pc = rtmp + bs2*row; 5041 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5042 if (flg) { 5043 pv = b->a + bs2*bdiag[row]; 5044 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5045 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5046 pv = 
b->a + bs2*(bdiag[row+1]+1); 5047 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5048 for (j=0; j<nz; j++) { 5049 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5050 } 5051 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5052 } 5053 } 5054 5055 /* finished row so stick it into b->a */ 5056 /* L part */ 5057 pv = b->a + bs2*bi[i] ; 5058 pj = b->j + bi[i] ; 5059 nz = bi[i+1] - bi[i]; 5060 for (j=0; j<nz; j++) { 5061 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5062 } 5063 5064 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5065 pv = b->a + bs2*bdiag[i]; 5066 pj = b->j + bdiag[i]; 5067 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5068 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5069 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5070 5071 /* U part */ 5072 pv = b->a + bs2*(bdiag[i+1]+1); 5073 pj = b->j + bdiag[i+1]+1; 5074 nz = bdiag[i] - bdiag[i+1] - 1; 5075 for (j=0; j<nz; j++){ 5076 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5077 } 5078 } 5079 5080 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5081 ierr = PetscFree(v_work);CHKERRQ(ierr); 5082 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5083 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5084 5085 C->assembled = PETSC_TRUE; 5086 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5087 PetscFunctionReturn(0); 5088 } 5089 5090 /* 5091 ilu(0) with natural ordering under new data structure. 5092 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 5093 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 
5094 */ 5095 5096 #undef __FUNCT__ 5097 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 5098 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5099 { 5100 5101 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5102 PetscErrorCode ierr; 5103 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5104 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5105 5106 PetscFunctionBegin; 5107 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5108 b = (Mat_SeqBAIJ*)(fact)->data; 5109 5110 /* allocate matrix arrays for new data structure */ 5111 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5112 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5113 b->singlemalloc = PETSC_TRUE; 5114 if (!b->diag){ 5115 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5116 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5117 } 5118 bdiag = b->diag; 5119 5120 if (n > 0) { 5121 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5122 } 5123 5124 /* set bi and bj with new data structure */ 5125 bi = b->i; 5126 bj = b->j; 5127 5128 /* L part */ 5129 bi[0] = 0; 5130 for (i=0; i<n; i++){ 5131 nz = adiag[i] - ai[i]; 5132 bi[i+1] = bi[i] + nz; 5133 aj = a->j + ai[i]; 5134 for (j=0; j<nz; j++){ 5135 *bj = aj[j]; bj++; 5136 } 5137 } 5138 5139 /* U part */ 5140 bi_temp = bi[n]; 5141 bdiag[n] = bi[n]-1; 5142 for (i=n-1; i>=0; i--){ 5143 nz = ai[i+1] - adiag[i] - 1; 5144 bi_temp = bi_temp + nz + 1; 5145 aj = a->j + adiag[i] + 1; 5146 for (j=0; j<nz; j++){ 5147 *bj = aj[j]; bj++; 5148 } 5149 /* diag[i] */ 5150 *bj = i; bj++; 5151 bdiag[i] = bi_temp - 1; 5152 } 5153 PetscFunctionReturn(0); 5154 } 5155 5156 #undef __FUNCT__ 5157 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 5158 
PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5159 { 5160 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5161 IS isicol; 5162 PetscErrorCode ierr; 5163 const PetscInt *r,*ic; 5164 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5165 PetscInt *bi,*cols,nnz,*cols_lvl; 5166 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5167 PetscInt i,levels,diagonal_fill; 5168 PetscTruth col_identity,row_identity,both_identity; 5169 PetscReal f; 5170 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5171 PetscBT lnkbt; 5172 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5173 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5174 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5175 PetscTruth missing; 5176 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5177 5178 PetscFunctionBegin; 5179 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5180 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5181 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5182 5183 f = info->fill; 5184 levels = (PetscInt)info->levels; 5185 diagonal_fill = (PetscInt)info->diagonal_fill; 5186 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5187 5188 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5189 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5190 both_identity = (PetscTruth) (row_identity && col_identity); 5191 5192 if (!levels && both_identity) { 5193 /* special case: ilu(0) with natural ordering */ 5194 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5195 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 5196 /* set MatSolve routines */ 5197 switch (bs){ 5198 case 2: 5199 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2; 5200 break; 5201 case 3: 5202 fact->ops->solve = 
MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2; 5203 break; 5204 case 4: 5205 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2; 5206 break; 5207 case 5: 5208 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2; 5209 break; 5210 case 6: 5211 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2; 5212 break; 5213 case 7: 5214 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2; 5215 break; 5216 default: 5217 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2; 5218 break; 5219 } 5220 5221 fact->factor = MAT_FACTOR_ILU; 5222 (fact)->info.factor_mallocs = 0; 5223 (fact)->info.fill_ratio_given = info->fill; 5224 (fact)->info.fill_ratio_needed = 1.0; 5225 b = (Mat_SeqBAIJ*)(fact)->data; 5226 b->row = isrow; 5227 b->col = iscol; 5228 b->icol = isicol; 5229 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5230 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5231 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5232 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5233 PetscFunctionReturn(0); 5234 } 5235 5236 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5237 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5238 5239 /* get new row pointers */ 5240 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5241 bi[0] = 0; 5242 /* bdiag is location of diagonal in factor */ 5243 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5244 bdiag[0] = 0; 5245 5246 ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr); 5247 bjlvl_ptr = (PetscInt**)(bj_ptr + n); 5248 5249 /* create a linked list for storing column indices of the active row */ 5250 nlnk = n + 1; 5251 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5252 5253 /* initial FreeSpace size is f*(ai[n]+1) */ 5254 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5255 current_space = free_space; 5256 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5257 current_space_lvl = free_space_lvl; 5258 5259 for (i=0; i<n; i++) { 5260 nzi = 0; 5261 /* copy current row into linked list */ 5262 nnz = ai[r[i]+1] - ai[r[i]]; 5263 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5264 cols = aj + ai[r[i]]; 5265 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5266 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5267 nzi += nlnk; 5268 5269 /* make sure diagonal entry is included */ 5270 if (diagonal_fill && lnk[i] == -1) { 5271 fm = n; 5272 while (lnk[fm] < i) fm = lnk[fm]; 5273 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5274 lnk[fm] = i; 5275 lnk_lvl[i] = 0; 5276 nzi++; dcount++; 5277 } 5278 5279 /* add pivot rows into the active row */ 5280 nzbd = 0; 5281 prow = lnk[n]; 5282 while (prow < i) { 5283 nnz = bdiag[prow]; 5284 cols = bj_ptr[prow] + 
nnz + 1; 5285 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5286 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5287 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5288 nzi += nlnk; 5289 prow = lnk[prow]; 5290 nzbd++; 5291 } 5292 bdiag[i] = nzbd; 5293 bi[i+1] = bi[i] + nzi; 5294 5295 /* if free space is not available, make more free space */ 5296 if (current_space->local_remaining<nzi) { 5297 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5298 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5299 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5300 reallocs++; 5301 } 5302 5303 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5304 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5305 bj_ptr[i] = current_space->array; 5306 bjlvl_ptr[i] = current_space_lvl->array; 5307 5308 /* make sure the active row i has diagonal entry */ 5309 if (*(bj_ptr[i]+bdiag[i]) != i) { 5310 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5311 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5312 } 5313 5314 current_space->array += nzi; 5315 current_space->local_used += nzi; 5316 current_space->local_remaining -= nzi; 5317 current_space_lvl->array += nzi; 5318 current_space_lvl->local_used += nzi; 5319 current_space_lvl->local_remaining -= nzi; 5320 } 5321 5322 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5323 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5324 5325 /* destroy list of free space and other temporary arrays */ 5326 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5327 5328 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5329 ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5330 5331 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5332 
ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5333 ierr = PetscFree(bj_ptr);CHKERRQ(ierr); 5334 5335 #if defined(PETSC_USE_INFO) 5336 { 5337 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5338 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5339 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5340 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5341 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5342 if (diagonal_fill) { 5343 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5344 } 5345 } 5346 #endif 5347 5348 /* put together the new matrix */ 5349 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5350 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5351 b = (Mat_SeqBAIJ*)(fact)->data; 5352 b->free_a = PETSC_TRUE; 5353 b->free_ij = PETSC_TRUE; 5354 b->singlemalloc = PETSC_FALSE; 5355 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5356 b->j = bj; 5357 b->i = bi; 5358 b->diag = bdiag; 5359 b->free_diag = PETSC_TRUE; 5360 b->ilen = 0; 5361 b->imax = 0; 5362 b->row = isrow; 5363 b->col = iscol; 5364 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5365 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5366 b->icol = isicol; 5367 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5368 /* In b structure: Free imax, ilen, old a, old j. 
5369 Allocate bdiag, solve_work, new a, new j */ 5370 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5371 b->maxnz = b->nz = bdiag[0]+1; 5372 (fact)->info.factor_mallocs = reallocs; 5373 (fact)->info.fill_ratio_given = f; 5374 (fact)->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5375 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 5376 /* set MatSolve routines */ 5377 if (both_identity){ 5378 switch (bs){ 5379 case 2: 5380 fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2; 5381 break; 5382 case 3: 5383 fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2; 5384 break; 5385 case 4: 5386 fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2; 5387 break; 5388 case 5: 5389 fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2; 5390 break; 5391 case 6: 5392 fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2; 5393 break; 5394 case 7: 5395 fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2; 5396 break; 5397 default: 5398 fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2; 5399 break; 5400 } 5401 } else { 5402 switch (bs){ 5403 case 2: 5404 fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct_v2; 5405 break; 5406 case 3: 5407 fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct_v2; 5408 break; 5409 case 4: 5410 fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct_v2; 5411 break; 5412 case 5: 5413 fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct_v2; 5414 break; 5415 case 6: 5416 fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct_v2; 5417 break; 5418 case 7: 5419 fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct_v2; 5420 break; 5421 default: 5422 fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct_v2; 5423 break; 5424 } 5425 } 5426 PetscFunctionReturn(0); 5427 } 5428 5429 5430 /* 5431 This code is virtually identical to 
MatILUFactorSymbolic_SeqAIJ 5432 except that the data structure of Mat_SeqAIJ is slightly different. 5433 Not a good example of code reuse. 5434 */ 5435 #undef __FUNCT__ 5436 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5437 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5438 { 5439 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5440 IS isicol; 5441 PetscErrorCode ierr; 5442 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5443 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5444 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5445 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5446 PetscTruth col_identity,row_identity,both_identity,flg; 5447 PetscReal f; 5448 PetscTruth newdatastruct = PETSC_FALSE; 5449 5450 PetscFunctionBegin; 5451 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 5452 if (newdatastruct){ 5453 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5454 PetscFunctionReturn(0); 5455 } 5456 5457 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5458 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5459 5460 f = info->fill; 5461 levels = (PetscInt)info->levels; 5462 diagonal_fill = (PetscInt)info->diagonal_fill; 5463 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5464 5465 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5466 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5467 both_identity = (PetscTruth) (row_identity && col_identity); 5468 5469 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5470 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5471 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5472 5473 fact->factor = MAT_FACTOR_ILU; 5474 b = 
(Mat_SeqBAIJ*)(fact)->data; 5475 b->row = isrow; 5476 b->col = iscol; 5477 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5478 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5479 b->icol = isicol; 5480 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5481 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5482 PetscFunctionReturn(0); 5483 } 5484 5485 /* general case perform the symbolic factorization */ 5486 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5487 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5488 5489 /* get new row pointers */ 5490 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5491 ainew[0] = 0; 5492 /* don't know how many column pointers are needed so estimate */ 5493 jmax = (PetscInt)(f*ai[n] + 1); 5494 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5495 /* ajfill is level of fill for each fill entry */ 5496 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5497 /* fill is a linked list of nonzeros in active row */ 5498 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5499 /* im is level for each filled value */ 5500 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5501 /* dloc is location of diagonal in factor */ 5502 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5503 dloc[0] = 0; 5504 for (prow=0; prow<n; prow++) { 5505 5506 /* copy prow into linked list */ 5507 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5508 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5509 xi = aj + ai[r[prow]]; 5510 fill[n] = n; 5511 fill[prow] = -1; /* marker for diagonal entry */ 5512 while (nz--) { 5513 fm = n; 5514 idx = ic[*xi++]; 5515 do { 5516 m = fm; 5517 fm = fill[m]; 5518 } while (fm < idx); 5519 fill[m] = idx; 5520 fill[idx] = fm; 5521 im[idx] = 0; 5522 } 5523 5524 /* make sure diagonal entry is included */ 
5525 if (diagonal_fill && fill[prow] == -1) { 5526 fm = n; 5527 while (fill[fm] < prow) fm = fill[fm]; 5528 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5529 fill[fm] = prow; 5530 im[prow] = 0; 5531 nzf++; 5532 dcount++; 5533 } 5534 5535 nzi = 0; 5536 row = fill[n]; 5537 while (row < prow) { 5538 incrlev = im[row] + 1; 5539 nz = dloc[row]; 5540 xi = ajnew + ainew[row] + nz + 1; 5541 flev = ajfill + ainew[row] + nz + 1; 5542 nnz = ainew[row+1] - ainew[row] - nz - 1; 5543 fm = row; 5544 while (nnz-- > 0) { 5545 idx = *xi++; 5546 if (*flev + incrlev > levels) { 5547 flev++; 5548 continue; 5549 } 5550 do { 5551 m = fm; 5552 fm = fill[m]; 5553 } while (fm < idx); 5554 if (fm != idx) { 5555 im[idx] = *flev + incrlev; 5556 fill[m] = idx; 5557 fill[idx] = fm; 5558 fm = idx; 5559 nzf++; 5560 } else { 5561 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5562 } 5563 flev++; 5564 } 5565 row = fill[row]; 5566 nzi++; 5567 } 5568 /* copy new filled row into permanent storage */ 5569 ainew[prow+1] = ainew[prow] + nzf; 5570 if (ainew[prow+1] > jmax) { 5571 5572 /* estimate how much additional space we will need */ 5573 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5574 /* just double the memory each time */ 5575 PetscInt maxadd = jmax; 5576 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5577 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5578 jmax += maxadd; 5579 5580 /* allocate a longer ajnew and ajfill */ 5581 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5582 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5583 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5584 ajnew = xitmp; 5585 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5586 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5587 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5588 ajfill = xitmp; 5589 reallocate++; /* count how many reallocations are needed */ 5590 } 5591 xitmp = ajnew + 
ainew[prow]; 5592 flev = ajfill + ainew[prow]; 5593 dloc[prow] = nzi; 5594 fm = fill[n]; 5595 while (nzf--) { 5596 *xitmp++ = fm; 5597 *flev++ = im[fm]; 5598 fm = fill[fm]; 5599 } 5600 /* make sure row has diagonal entry */ 5601 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5602 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5603 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5604 } 5605 } 5606 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5607 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5608 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5609 ierr = PetscFree(fill);CHKERRQ(ierr); 5610 ierr = PetscFree(im);CHKERRQ(ierr); 5611 5612 #if defined(PETSC_USE_INFO) 5613 { 5614 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5615 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5616 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5617 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5618 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5619 if (diagonal_fill) { 5620 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5621 } 5622 } 5623 #endif 5624 5625 /* put together the new matrix */ 5626 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5627 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5628 b = (Mat_SeqBAIJ*)(fact)->data; 5629 b->free_a = PETSC_TRUE; 5630 b->free_ij = PETSC_TRUE; 5631 b->singlemalloc = PETSC_FALSE; 5632 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5633 b->j = ajnew; 5634 b->i = ainew; 5635 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5636 b->diag = dloc; 5637 b->free_diag = PETSC_TRUE; 5638 b->ilen = 0; 5639 b->imax = 0; 5640 b->row = isrow; 5641 b->col = iscol; 5642 b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5643 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5644 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5645 b->icol = isicol; 5646 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5647 /* In b structure: Free imax, ilen, old a, old j. 5648 Allocate dloc, solve_work, new a, new j */ 5649 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5650 b->maxnz = b->nz = ainew[n]; 5651 5652 (fact)->info.factor_mallocs = reallocate; 5653 (fact)->info.fill_ratio_given = f; 5654 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5655 5656 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5657 PetscFunctionReturn(0); 5658 } 5659 5660 #undef __FUNCT__ 5661 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5662 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5663 { 5664 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5665 /* int i,*AJ=a->j,nz=a->nz; */ 5666 PetscFunctionBegin; 5667 /* Undo Column scaling */ 5668 /* while (nz--) { */ 5669 /* AJ[i] = AJ[i]/4; */ 5670 /* } */ 5671 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5672 A->ops->setunfactored = PETSC_NULL; 5673 PetscFunctionReturn(0); 5674 } 5675 5676 #undef __FUNCT__ 5677 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5678 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5679 { 5680 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5681 PetscInt *AJ=a->j,nz=a->nz; 5682 unsigned short *aj=(unsigned short *)AJ; 5683 PetscFunctionBegin; 5684 /* Is this really necessary? */ 5685 while (nz--) { 5686 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5687 } 5688 A->ops->setunfactored = PETSC_NULL; 5689 PetscFunctionReturn(0); 5690 } 5691 5692 5693