1 #define PETSCMAT_DLL 2 3 #include "../src/mat/impls/baij/seq/baij.h" 4 #include "../src/mat/impls/sbaij/seq/sbaij.h" 5 #include "../src/inline/ilu.h" 6 #include "petscis.h" 7 8 #if !defined(PETSC_USE_COMPLEX) 9 /* 10 input: 11 F -- numeric factor 12 output: 13 nneg, nzero, npos: matrix inertia 14 */ 15 16 #undef __FUNCT__ 17 #define __FUNCT__ "MatGetInertia_SeqSBAIJ" 18 PetscErrorCode MatGetInertia_SeqSBAIJ(Mat F,PetscInt *nneig,PetscInt *nzero,PetscInt *npos) 19 { 20 Mat_SeqSBAIJ *fact_ptr = (Mat_SeqSBAIJ*)F->data; 21 MatScalar *dd = fact_ptr->a; 22 PetscInt mbs=fact_ptr->mbs,bs=F->rmap->bs,i,nneig_tmp,npos_tmp,*fi = fact_ptr->i; 23 24 PetscFunctionBegin; 25 if (bs != 1) SETERRQ1(PETSC_ERR_SUP,"No support for bs: %D >1 yet",bs); 26 nneig_tmp = 0; npos_tmp = 0; 27 for (i=0; i<mbs; i++){ 28 if (PetscRealPart(dd[*fi]) > 0.0){ 29 npos_tmp++; 30 } else if (PetscRealPart(dd[*fi]) < 0.0){ 31 nneig_tmp++; 32 } 33 fi++; 34 } 35 if (nneig) *nneig = nneig_tmp; 36 if (npos) *npos = npos_tmp; 37 if (nzero) *nzero = mbs - nneig_tmp - npos_tmp; 38 39 PetscFunctionReturn(0); 40 } 41 #endif /* !defined(PETSC_USE_COMPLEX) */ 42 43 extern PetscErrorCode MatSeqSBAIJSetNumericFactorization(Mat,PetscTruth); 44 45 /* 46 Symbolic U^T*D*U factorization for SBAIJ format. Modified from SSF of YSMP. 47 Use Modified Sparse Row (MSR) storage for u and ju. See page 85, "Iterative Methods ..." by Saad. 48 */ 49 #undef __FUNCT__ 50 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ_MSR" 51 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(Mat F,Mat A,IS perm,const MatFactorInfo *info) 52 { 53 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 54 PetscErrorCode ierr; 55 const PetscInt *rip,*ai,*aj; 56 PetscInt i,mbs = a->mbs,*jutmp,bs = A->rmap->bs,bs2=a->bs2; 57 PetscInt m,reallocs = 0,prow; 58 PetscInt *jl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 59 PetscReal f = info->fill; 60 PetscTruth perm_identity; 61 62 PetscFunctionBegin; 63 /* check whether perm is the identity mapping */ 64 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 65 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 66 67 if (perm_identity){ /* without permutation */ 68 a->permute = PETSC_FALSE; 69 ai = a->i; aj = a->j; 70 } else { /* non-trivial permutation */ 71 a->permute = PETSC_TRUE; 72 ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr); 73 ai = a->inew; aj = a->jnew; 74 } 75 76 /* initialization */ 77 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr); 78 umax = (PetscInt)(f*ai[mbs] + 1); umax += mbs + 1; 79 ierr = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr); 80 iu[0] = mbs+1; 81 juidx = mbs + 1; /* index for ju */ 82 ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&jl);CHKERRQ(ierr); /* linked list for pivot row */ 83 q = jl + mbs; /* linked list for col index */ 84 for (i=0; i<mbs; i++){ 85 jl[i] = mbs; 86 q[i] = 0; 87 } 88 89 /* for each row k */ 90 for (k=0; k<mbs; k++){ 91 for (i=0; i<mbs; i++) q[i] = 0; /* to be removed! */ 92 nzk = 0; /* num. of nz blocks in k-th block row with diagonal block excluded */ 93 q[k] = mbs; 94 /* initialize nonzero structure of k-th row to row rip[k] of A */ 95 jmin = ai[rip[k]] +1; /* exclude diag[k] */ 96 jmax = ai[rip[k]+1]; 97 for (j=jmin; j<jmax; j++){ 98 vj = rip[aj[j]]; /* col. value */ 99 if(vj > k){ 100 qm = k; 101 do { 102 m = qm; qm = q[m]; 103 } while(qm < vj); 104 if (qm == vj) { 105 SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n"); 106 } 107 nzk++; 108 q[m] = vj; 109 q[vj] = qm; 110 } /* if(vj > k) */ 111 } /* for (j=jmin; j<jmax; j++) */ 112 113 /* modify nonzero structure of k-th row by computing fill-in 114 for each row i to be merged in */ 115 prow = k; 116 prow = jl[prow]; /* next pivot row (== mbs for symbolic factorization) */ 117 118 while (prow < k){ 119 /* merge row prow into k-th row */ 120 jmin = iu[prow] + 1; jmax = iu[prow+1]; 121 qm = k; 122 for (j=jmin; j<jmax; j++){ 123 vj = ju[j]; 124 do { 125 m = qm; qm = q[m]; 126 } while (qm < vj); 127 if (qm != vj){ 128 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 129 } 130 } 131 prow = jl[prow]; /* next pivot row */ 132 } 133 134 /* add k to row list for first nonzero element in k-th row */ 135 if (nzk > 0){ 136 i = q[k]; /* col value of first nonzero element in U(k, k+1:mbs-1) */ 137 jl[k] = jl[i]; jl[i] = k; 138 } 139 iu[k+1] = iu[k] + nzk; 140 141 /* allocate more space to ju if needed */ 142 if (iu[k+1] > umax) { 143 /* estimate how much additional space we will need */ 144 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 145 /* just double the memory each time */ 146 maxadd = umax; 147 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 148 umax += maxadd; 149 150 /* allocate a longer ju */ 151 ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr); 152 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr); 153 ierr = PetscFree(ju);CHKERRQ(ierr); 154 ju = jutmp; 155 reallocs++; /* count how many times we realloc */ 156 } 157 158 /* save nonzero structure of k-th row in ju */ 159 i=k; 160 while (nzk --) { 161 i = q[i]; 162 ju[juidx++] = i; 163 } 164 } 165 166 #if defined(PETSC_USE_INFO) 167 if (ai[mbs] != 0) { 168 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 169 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 170 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 171 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 172 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 173 } else { 174 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 175 } 176 #endif 177 178 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 179 ierr = PetscFree(jl);CHKERRQ(ierr); 180 181 /* put together the new matrix */ 182 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(F,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 183 184 /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */ 185 b = (Mat_SeqSBAIJ*)(F)->data; 186 b->singlemalloc = PETSC_FALSE; 187 b->free_a = PETSC_TRUE; 188 b->free_ij = PETSC_TRUE; 189 ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr); 190 b->j = ju; 191 b->i = iu; 192 b->diag = 0; 193 b->ilen = 0; 194 b->imax = 0; 195 b->row = perm; 196 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 197 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 198 b->icol = perm; 199 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 200 ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 201 /* In b structure: Free imax, ilen, old a, old j. 202 Allocate idnew, solve_work, new a, new j */ 203 ierr = PetscLogObjectMemory(F,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 204 b->maxnz = b->nz = iu[mbs]; 205 206 (F)->info.factor_mallocs = reallocs; 207 (F)->info.fill_ratio_given = f; 208 if (ai[mbs] != 0) { 209 (F)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 210 } else { 211 (F)->info.fill_ratio_needed = 0.0; 212 } 213 ierr = MatSeqSBAIJSetNumericFactorization(F,perm_identity);CHKERRQ(ierr); 214 PetscFunctionReturn(0); 215 } 216 /* 217 Symbolic U^T*D*U factorization for SBAIJ format. 218 */ 219 #include "petscbt.h" 220 #include "../src/mat/utils/freespace.h" 221 #undef __FUNCT__ 222 #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqSBAIJ" 223 PetscErrorCode MatCholeskyFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 224 { 225 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 226 Mat_SeqSBAIJ *b; 227 PetscErrorCode ierr; 228 PetscTruth perm_identity,missing; 229 PetscReal fill = info->fill; 230 const PetscInt *rip,*ai,*aj; 231 PetscInt i,mbs=a->mbs,bs=A->rmap->bs,reallocs=0,prow,d; 232 PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow; 233 PetscInt nlnk,*lnk,ncols,*cols,*uj,**ui_ptr,*uj_ptr; 234 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 235 PetscBT lnkbt; 236 237 PetscFunctionBegin; 238 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 239 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 240 241 /* 242 This code originally uses Modified Sparse Row (MSR) storage 243 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choise! 244 Then it is rewritten so the factor B takes seqsbaij format. However the associated 245 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1 or !perm_identity, 246 thus the original code in MSR format is still used for these cases. 247 The code below should replace MatCholeskyFactorSymbolic_SeqSBAIJ_MSR() whenever 248 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 249 */ 250 if (bs > 1){ 251 ierr = MatCholeskyFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 252 PetscFunctionReturn(0); 253 } 254 255 /* check whether perm is the identity mapping */ 256 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 257 258 if (perm_identity){ 259 a->permute = PETSC_FALSE; 260 ai = a->i; aj = a->j; 261 } else { 262 SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 263 /* There are bugs for reordeing. Needs further work. 264 MatReordering for sbaij cannot be efficient. User should use aij formt! */ 265 a->permute = PETSC_TRUE; 266 ierr = MatReorderingSeqSBAIJ(A,perm);CHKERRQ(ierr); 267 ai = a->inew; aj = a->jnew; 268 } 269 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 270 271 /* initialization */ 272 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 273 ui[0] = 0; 274 275 /* jl: linked list for storing indices of the pivot rows 276 il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 277 ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr); 278 il = jl + mbs; 279 cols = il + mbs; 280 ui_ptr = (PetscInt**)(cols + mbs); 281 282 for (i=0; i<mbs; i++){ 283 jl[i] = mbs; il[i] = 0; 284 } 285 286 /* create and initialize a linked list for storing column indices of the active row k */ 287 nlnk = mbs + 1; 288 ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 289 290 /* initial FreeSpace size is fill*(ai[mbs]+1) */ 291 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr); 292 current_space = free_space; 293 294 for (k=0; k<mbs; k++){ /* for each active row k */ 295 /* initialize lnk by the column indices of row rip[k] of A */ 296 nzk = 0; 297 ncols = ai[rip[k]+1] - ai[rip[k]]; 298 for (j=0; j<ncols; j++){ 299 i = *(aj + ai[rip[k]] + j); 300 cols[j] = rip[i]; 301 } 302 ierr = PetscLLAdd(ncols,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 303 nzk += nlnk; 304 305 /* update lnk by computing fill-in for each pivot row to be merged in */ 306 prow = jl[k]; /* 1st pivot row */ 307 308 while (prow < k){ 309 nextprow = jl[prow]; 310 /* merge prow into k-th row */ 311 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 312 jmax = ui[prow+1]; 313 ncols = jmax-jmin; 314 uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 315 ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 316 nzk += nlnk; 317 318 /* update il and jl for prow */ 319 if (jmin < jmax){ 320 il[prow] = jmin; 321 j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow; 322 } 323 prow = nextprow; 324 } 325 326 /* if free space is not available, make more free space */ 327 if (current_space->local_remaining<nzk) { 328 i = mbs - k + 1; /* num of unfactored rows */ 329 i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 330 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 331 reallocs++; 332 } 333 334 /* copy data into free space, then initialize lnk */ 335 ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 336 337 /* add the k-th row into il and jl */ 338 if (nzk-1 > 0){ 339 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 340 jl[k] = jl[i]; jl[i] = k; 341 il[k] = ui[k] + 1; 342 } 343 ui_ptr[k] = current_space->array; 344 current_space->array += nzk; 345 current_space->local_used += nzk; 346 current_space->local_remaining -= nzk; 347 348 ui[k+1] = ui[k] + nzk; 349 } 350 351 #if defined(PETSC_USE_INFO) 352 if (ai[mbs] != 0) { 353 PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]); 354 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr); 355 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 356 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr); 357 } else { 358 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 359 } 360 #endif 361 362 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 363 ierr = PetscFree(jl);CHKERRQ(ierr); 364 365 /* destroy list of free space and other temporary array(s) */ 366 ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 367 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 368 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 369 370 /* put together the new matrix in MATSEQSBAIJ format */ 371 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 372 373 b = (Mat_SeqSBAIJ*)(fact)->data; 374 b->singlemalloc = PETSC_FALSE; 375 b->free_a = PETSC_TRUE; 376 b->free_ij = PETSC_TRUE; 377 ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 378 b->j = uj; 379 b->i = ui; 380 b->diag = 0; 381 b->ilen = 0; 382 b->imax = 0; 383 b->row = perm; 384 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 385 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 386 b->icol = perm; 387 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 388 ierr = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 389 ierr = PetscLogObjectMemory(fact,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 390 b->maxnz = b->nz = ui[mbs]; 391 392 (fact)->info.factor_mallocs = reallocs; 393 (fact)->info.fill_ratio_given = fill; 394 if (ai[mbs] != 0) { 395 (fact)->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]); 396 } else { 397 (fact)->info.fill_ratio_needed = 0.0; 398 } 399 ierr = MatSeqSBAIJSetNumericFactorization(fact,perm_identity);CHKERRQ(ierr); 400 PetscFunctionReturn(0); 401 } 402 #undef __FUNCT__ 403 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N" 404 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N(Mat C,Mat A,const MatFactorInfo *info) 405 { 406 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 407 IS perm = b->row; 408 PetscErrorCode ierr; 409 const PetscInt *ai,*aj,*perm_ptr,mbs=a->mbs,*bi=b->i,*bj=b->j; 410 PetscInt i,j; 411 PetscInt *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 412 PetscInt bs=A->rmap->bs,bs2 = a->bs2,bslog = 0; 413 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 414 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 415 MatScalar *work; 416 PetscInt *pivots; 417 418 PetscFunctionBegin; 419 /* initialization */ 420 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 421 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 422 ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr); 423 jl = il + mbs; 424 for (i=0; i<mbs; i++) { 425 jl[i] = mbs; il[0] = 0; 426 } 427 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr); 428 uik = dk + bs2; 429 work = uik + bs2; 430 ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr); 431 432 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 433 434 /* check permutation */ 435 if (!a->permute){ 436 ai = a->i; aj = a->j; aa = a->a; 437 } else { 438 ai = a->inew; aj = a->jnew; 439 ierr = PetscMalloc(bs2*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 440 ierr = PetscMemcpy(aa,a->a,bs2*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 441 ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr); 442 ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr); 443 444 /* flops in while loop */ 445 bslog = 2*bs*bs2; 446 447 for (i=0; i<mbs; i++){ 448 jmin = ai[i]; jmax = ai[i+1]; 449 for (j=jmin; j<jmax; j++){ 450 while (a2anew[j] != j){ 451 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 452 for (k1=0; k1<bs2; k1++){ 453 dk[k1] = aa[k*bs2+k1]; 454 aa[k*bs2+k1] = aa[j*bs2+k1]; 455 aa[j*bs2+k1] = dk[k1]; 456 } 457 } 458 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 459 if (i > aj[j]){ 460 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 461 ap = aa + j*bs2; /* ptr to the beginning of j-th block of aa */ 462 for (k=0; k<bs2; k++) dk[k] = ap[k]; /* dk <- j-th block of aa */ 463 for (k=0; k<bs; k++){ /* j-th block of aa <- dk^T */ 464 for (k1=0; k1<bs; k1++) *ap++ = dk[k + bs*k1]; 465 } 466 } 467 } 468 } 469 ierr = PetscFree(a2anew);CHKERRQ(ierr); 470 } 471 472 /* for each row k */ 473 for (k = 0; k<mbs; k++){ 474 475 /*initialize k-th row with elements nonzero in row perm(k) of A */ 476 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 477 478 ap = aa + jmin*bs2; 479 for (j = jmin; j < jmax; j++){ 480 vj = perm_ptr[aj[j]]; /* block col. index */ 481 rtmp_ptr = rtmp + vj*bs2; 482 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 483 } 484 485 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 486 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 487 i = jl[k]; /* first row to be added to k_th row */ 488 489 while (i < k){ 490 nexti = jl[i]; /* next row to be added to k_th row */ 491 492 /* compute multiplier */ 493 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 494 495 /* uik = -inv(Di)*U_bar(i,k) */ 496 diag = ba + i*bs2; 497 u = ba + ili*bs2; 498 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 499 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 500 501 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 502 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 503 ierr = PetscLogFlops(bslog*2);CHKERRQ(ierr); 504 505 /* update -U(i,k) */ 506 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 507 508 /* add multiple of row i to k-th row ... */ 509 jmin = ili + 1; jmax = bi[i+1]; 510 if (jmin < jmax){ 511 for (j=jmin; j<jmax; j++) { 512 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 513 rtmp_ptr = rtmp + bj[j]*bs2; 514 u = ba + j*bs2; 515 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 516 } 517 ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr); 518 519 /* ... add i to row list for next nonzero entry */ 520 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 521 j = bj[jmin]; 522 jl[i] = jl[j]; jl[j] = i; /* update jl */ 523 } 524 i = nexti; 525 } 526 527 /* save nonzero entries in k-th row of U ... */ 528 529 /* invert diagonal block */ 530 diag = ba+k*bs2; 531 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 532 ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr); 533 534 jmin = bi[k]; jmax = bi[k+1]; 535 if (jmin < jmax) { 536 for (j=jmin; j<jmax; j++){ 537 vj = bj[j]; /* block col. index of U */ 538 u = ba + j*bs2; 539 rtmp_ptr = rtmp + vj*bs2; 540 for (k1=0; k1<bs2; k1++){ 541 *u++ = *rtmp_ptr; 542 *rtmp_ptr++ = 0.0; 543 } 544 } 545 546 /* ... add k to row list for first nonzero entry in k-th row */ 547 il[k] = jmin; 548 i = bj[jmin]; 549 jl[k] = jl[i]; jl[i] = k; 550 } 551 } 552 553 ierr = PetscFree(rtmp);CHKERRQ(ierr); 554 ierr = PetscFree(il);CHKERRQ(ierr); 555 ierr = PetscFree(dk);CHKERRQ(ierr); 556 ierr = PetscFree(pivots);CHKERRQ(ierr); 557 if (a->permute){ 558 ierr = PetscFree(aa);CHKERRQ(ierr); 559 } 560 561 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 562 C->ops->solve = MatSolve_SeqSBAIJ_N; 563 C->ops->solvetranspose = MatSolve_SeqSBAIJ_N; 564 C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N; 565 C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N; 566 567 C->assembled = PETSC_TRUE; 568 C->preallocated = PETSC_TRUE; 569 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 570 PetscFunctionReturn(0); 571 } 572 573 #undef __FUNCT__ 574 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering" 575 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 576 { 577 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 578 PetscErrorCode ierr; 579 PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 580 PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 581 PetscInt bs=A->rmap->bs,bs2 = a->bs2,bslog; 582 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 583 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 584 MatScalar *work; 585 PetscInt *pivots; 586 587 PetscFunctionBegin; 588 ierr = PetscMalloc(bs2*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 589 ierr = PetscMemzero(rtmp,bs2*mbs*sizeof(MatScalar));CHKERRQ(ierr); 590 ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr); 591 jl = il + mbs; 592 for (i=0; i<mbs; i++) { 593 jl[i] = mbs; il[0] = 0; 594 } 595 ierr = PetscMalloc((2*bs2+bs)*sizeof(MatScalar),&dk);CHKERRQ(ierr); 596 uik = dk + bs2; 597 work = uik + bs2; 598 ierr = PetscMalloc(bs*sizeof(PetscInt),&pivots);CHKERRQ(ierr); 599 600 ai = a->i; aj = a->j; aa = a->a; 601 602 /* flops in while loop */ 603 bslog = 2*bs*bs2; 604 605 /* for each row k */ 606 for (k = 0; k<mbs; k++){ 607 608 /*initialize k-th row with elements nonzero in row k of A */ 609 jmin = ai[k]; jmax = ai[k+1]; 610 ap = aa + jmin*bs2; 611 for (j = jmin; j < jmax; j++){ 612 vj = aj[j]; /* block col. index */ 613 rtmp_ptr = rtmp + vj*bs2; 614 for (i=0; i<bs2; i++) *rtmp_ptr++ = *ap++; 615 } 616 617 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 618 ierr = PetscMemcpy(dk,rtmp+k*bs2,bs2*sizeof(MatScalar));CHKERRQ(ierr); 619 i = jl[k]; /* first row to be added to k_th row */ 620 621 while (i < k){ 622 nexti = jl[i]; /* next row to be added to k_th row */ 623 624 /* compute multiplier */ 625 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 626 627 /* uik = -inv(Di)*U_bar(i,k) */ 628 diag = ba + i*bs2; 629 u = ba + ili*bs2; 630 ierr = PetscMemzero(uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 631 Kernel_A_gets_A_minus_B_times_C(bs,uik,diag,u); 632 633 /* update D(k) += -U(i,k)^T * U_bar(i,k) */ 634 Kernel_A_gets_A_plus_Btranspose_times_C(bs,dk,uik,u); 635 ierr = PetscLogFlops(bslog*2);CHKERRQ(ierr); 636 637 /* update -U(i,k) */ 638 ierr = PetscMemcpy(ba+ili*bs2,uik,bs2*sizeof(MatScalar));CHKERRQ(ierr); 639 640 /* add multiple of row i to k-th row ... */ 641 jmin = ili + 1; jmax = bi[i+1]; 642 if (jmin < jmax){ 643 for (j=jmin; j<jmax; j++) { 644 /* rtmp += -U(i,k)^T * U_bar(i,j) */ 645 rtmp_ptr = rtmp + bj[j]*bs2; 646 u = ba + j*bs2; 647 Kernel_A_gets_A_plus_Btranspose_times_C(bs,rtmp_ptr,uik,u); 648 } 649 ierr = PetscLogFlops(bslog*(jmax-jmin));CHKERRQ(ierr); 650 651 /* ... add i to row list for next nonzero entry */ 652 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 653 j = bj[jmin]; 654 jl[i] = jl[j]; jl[j] = i; /* update jl */ 655 } 656 i = nexti; 657 } 658 659 /* save nonzero entries in k-th row of U ... */ 660 661 /* invert diagonal block */ 662 diag = ba+k*bs2; 663 ierr = PetscMemcpy(diag,dk,bs2*sizeof(MatScalar));CHKERRQ(ierr); 664 ierr = Kernel_A_gets_inverse_A(bs,diag,pivots,work);CHKERRQ(ierr); 665 666 jmin = bi[k]; jmax = bi[k+1]; 667 if (jmin < jmax) { 668 for (j=jmin; j<jmax; j++){ 669 vj = bj[j]; /* block col. index of U */ 670 u = ba + j*bs2; 671 rtmp_ptr = rtmp + vj*bs2; 672 for (k1=0; k1<bs2; k1++){ 673 *u++ = *rtmp_ptr; 674 *rtmp_ptr++ = 0.0; 675 } 676 } 677 678 /* ... add k to row list for first nonzero entry in k-th row */ 679 il[k] = jmin; 680 i = bj[jmin]; 681 jl[k] = jl[i]; jl[i] = k; 682 } 683 } 684 685 ierr = PetscFree(rtmp);CHKERRQ(ierr); 686 ierr = PetscFree(il);CHKERRQ(ierr); 687 ierr = PetscFree(dk);CHKERRQ(ierr); 688 ierr = PetscFree(pivots);CHKERRQ(ierr); 689 690 C->ops->solve = MatSolve_SeqSBAIJ_N_NaturalOrdering; 691 C->ops->solvetranspose = MatSolve_SeqSBAIJ_N_NaturalOrdering; 692 C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering; 693 C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering; 694 C->assembled = PETSC_TRUE; 695 C->preallocated = PETSC_TRUE; 696 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 697 PetscFunctionReturn(0); 698 } 699 700 /* 701 Numeric U^T*D*U factorization for SBAIJ format. Modified from SNF of YSMP. 702 Version for blocks 2 by 2. 703 */ 704 #undef __FUNCT__ 705 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2" 706 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2(Mat C,Mat A,const MatFactorInfo *info) 707 { 708 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 709 IS perm = b->row; 710 PetscErrorCode ierr; 711 const PetscInt *ai,*aj,*perm_ptr; 712 PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 713 PetscInt *a2anew,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 714 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 715 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 716 PetscReal shift = info->shiftinblocks; 717 718 PetscFunctionBegin; 719 /* initialization */ 720 /* il and jl record the first nonzero element in each row of the accessing 721 window U(0:k, k:mbs-1). 722 jl: list of rows to be added to uneliminated rows 723 i>= k: jl(i) is the first row to be added to row i 724 i< k: jl(i) is the row following row i in some list of rows 725 jl(i) = mbs indicates the end of a list 726 il(i): points to the first nonzero element in columns k,...,mbs-1 of 727 row i of U */ 728 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 729 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 730 ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr); 731 jl = il + mbs; 732 for (i=0; i<mbs; i++) { 733 jl[i] = mbs; il[0] = 0; 734 } 735 ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr); 736 uik = dk + 4; 737 ierr = ISGetIndices(perm,&perm_ptr);CHKERRQ(ierr); 738 739 /* check permutation */ 740 if (!a->permute){ 741 ai = a->i; aj = a->j; aa = a->a; 742 } else { 743 ai = a->inew; aj = a->jnew; 744 ierr = PetscMalloc(4*ai[mbs]*sizeof(MatScalar),&aa);CHKERRQ(ierr); 745 ierr = PetscMemcpy(aa,a->a,4*ai[mbs]*sizeof(MatScalar));CHKERRQ(ierr); 746 ierr = PetscMalloc(ai[mbs]*sizeof(PetscInt),&a2anew);CHKERRQ(ierr); 747 ierr = PetscMemcpy(a2anew,a->a2anew,(ai[mbs])*sizeof(PetscInt));CHKERRQ(ierr); 748 749 for (i=0; i<mbs; i++){ 750 jmin = ai[i]; jmax = ai[i+1]; 751 for (j=jmin; j<jmax; j++){ 752 while (a2anew[j] != j){ 753 k = a2anew[j]; a2anew[j] = a2anew[k]; a2anew[k] = k; 754 for (k1=0; k1<4; k1++){ 755 dk[k1] = aa[k*4+k1]; 756 aa[k*4+k1] = aa[j*4+k1]; 757 aa[j*4+k1] = dk[k1]; 758 } 759 } 760 /* transform columnoriented blocks that lie in the lower triangle to roworiented blocks */ 761 if (i > aj[j]){ 762 /* printf("change orientation, row: %d, col: %d\n",i,aj[j]); */ 763 ap = aa + j*4; /* ptr to the beginning of the block */ 764 dk[1] = ap[1]; /* swap ap[1] and ap[2] */ 765 ap[1] = ap[2]; 766 ap[2] = dk[1]; 767 } 768 } 769 } 770 ierr = PetscFree(a2anew);CHKERRQ(ierr); 771 } 772 773 /* for each row k */ 774 for (k = 0; k<mbs; k++){ 775 776 /*initialize k-th row with elements nonzero in row perm(k) of A */ 777 jmin = ai[perm_ptr[k]]; jmax = ai[perm_ptr[k]+1]; 778 ap = aa + jmin*4; 779 for (j = jmin; j < jmax; j++){ 780 vj = perm_ptr[aj[j]]; /* block col. index */ 781 rtmp_ptr = rtmp + vj*4; 782 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 783 } 784 785 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 786 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 787 i = jl[k]; /* first row to be added to k_th row */ 788 789 while (i < k){ 790 nexti = jl[i]; /* next row to be added to k_th row */ 791 792 /* compute multiplier */ 793 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 794 795 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 796 diag = ba + i*4; 797 u = ba + ili*4; 798 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 799 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 800 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 801 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 802 803 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 804 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 805 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 806 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 807 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 808 809 ierr = PetscLogFlops(16*2);CHKERRQ(ierr); 810 811 /* update -U(i,k): ba[ili] = uik */ 812 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 813 814 /* add multiple of row i to k-th row ... */ 815 jmin = ili + 1; jmax = bi[i+1]; 816 if (jmin < jmax){ 817 for (j=jmin; j<jmax; j++) { 818 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 819 rtmp_ptr = rtmp + bj[j]*4; 820 u = ba + j*4; 821 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 822 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 823 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 824 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 825 } 826 ierr = PetscLogFlops(16*(jmax-jmin));CHKERRQ(ierr); 827 828 /* ... add i to row list for next nonzero entry */ 829 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 830 j = bj[jmin]; 831 jl[i] = jl[j]; jl[j] = i; /* update jl */ 832 } 833 i = nexti; 834 } 835 836 /* save nonzero entries in k-th row of U ... */ 837 838 /* invert diagonal block */ 839 diag = ba+k*4; 840 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 841 ierr = Kernel_A_gets_inverse_A_2(diag,shift);CHKERRQ(ierr); 842 843 jmin = bi[k]; jmax = bi[k+1]; 844 if (jmin < jmax) { 845 for (j=jmin; j<jmax; j++){ 846 vj = bj[j]; /* block col. index of U */ 847 u = ba + j*4; 848 rtmp_ptr = rtmp + vj*4; 849 for (k1=0; k1<4; k1++){ 850 *u++ = *rtmp_ptr; 851 *rtmp_ptr++ = 0.0; 852 } 853 } 854 855 /* ... add k to row list for first nonzero entry in k-th row */ 856 il[k] = jmin; 857 i = bj[jmin]; 858 jl[k] = jl[i]; jl[i] = k; 859 } 860 } 861 862 ierr = PetscFree(rtmp);CHKERRQ(ierr); 863 ierr = PetscFree(il);CHKERRQ(ierr); 864 ierr = PetscFree(dk);CHKERRQ(ierr); 865 if (a->permute) { 866 ierr = PetscFree(aa);CHKERRQ(ierr); 867 } 868 ierr = ISRestoreIndices(perm,&perm_ptr);CHKERRQ(ierr); 869 C->ops->solve = MatSolve_SeqSBAIJ_2; 870 C->ops->solvetranspose = MatSolve_SeqSBAIJ_2; 871 C->assembled = PETSC_TRUE; 872 C->preallocated = PETSC_TRUE; 873 ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 874 PetscFunctionReturn(0); 875 } 876 877 /* 878 Version for when blocks are 2 by 2 Using natural ordering 879 */ 880 #undef __FUNCT__ 881 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering" 882 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 883 { 884 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ *)C->data; 885 PetscErrorCode ierr; 886 PetscInt i,j,mbs=a->mbs,*bi=b->i,*bj=b->j; 887 PetscInt *ai,*aj,k,k1,jmin,jmax,*jl,*il,vj,nexti,ili; 888 MatScalar *ba = b->a,*aa,*ap,*dk,*uik; 889 MatScalar *u,*diag,*rtmp,*rtmp_ptr; 890 PetscReal shift = info->shiftinblocks; 891 892 PetscFunctionBegin; 893 /* initialization */ 894 /* il and jl record the first nonzero element in each row of the accessing 895 window U(0:k, k:mbs-1). 896 jl: list of rows to be added to uneliminated rows 897 i>= k: jl(i) is the first row to be added to row i 898 i< k: jl(i) is the row following row i in some list of rows 899 jl(i) = mbs indicates the end of a list 900 il(i): points to the first nonzero element in columns k,...,mbs-1 of 901 row i of U */ 902 ierr = PetscMalloc(4*mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 903 ierr = PetscMemzero(rtmp,4*mbs*sizeof(MatScalar));CHKERRQ(ierr); 904 ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr); 905 jl = il + mbs; 906 for (i=0; i<mbs; i++) { 907 jl[i] = mbs; il[0] = 0; 908 } 909 ierr = PetscMalloc(8*sizeof(MatScalar),&dk);CHKERRQ(ierr); 910 uik = dk + 4; 911 912 ai = a->i; aj = a->j; aa = a->a; 913 914 /* for each row k */ 915 for (k = 0; k<mbs; k++){ 916 917 /*initialize k-th row with elements nonzero in row k of A */ 918 jmin = ai[k]; jmax = ai[k+1]; 919 ap = aa + jmin*4; 920 for (j = jmin; j < jmax; j++){ 921 vj = aj[j]; /* block col. index */ 922 rtmp_ptr = rtmp + vj*4; 923 for (i=0; i<4; i++) *rtmp_ptr++ = *ap++; 924 } 925 926 /* modify k-th row by adding in those rows i with U(i,k) != 0 */ 927 ierr = PetscMemcpy(dk,rtmp+k*4,4*sizeof(MatScalar));CHKERRQ(ierr); 928 i = jl[k]; /* first row to be added to k_th row */ 929 930 while (i < k){ 931 nexti = jl[i]; /* next row to be added to k_th row */ 932 933 /* compute multiplier */ 934 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 935 936 /* uik = -inv(Di)*U_bar(i,k): - ba[ili]*ba[i] */ 937 diag = ba + i*4; 938 u = ba + ili*4; 939 uik[0] = -(diag[0]*u[0] + diag[2]*u[1]); 940 uik[1] = -(diag[1]*u[0] + diag[3]*u[1]); 941 uik[2] = -(diag[0]*u[2] + diag[2]*u[3]); 942 uik[3] = -(diag[1]*u[2] + diag[3]*u[3]); 943 944 /* update D(k) += -U(i,k)^T * U_bar(i,k): dk += uik*ba[ili] */ 945 dk[0] += uik[0]*u[0] + uik[1]*u[1]; 946 dk[1] += uik[2]*u[0] + uik[3]*u[1]; 947 dk[2] += uik[0]*u[2] + uik[1]*u[3]; 948 dk[3] += uik[2]*u[2] + uik[3]*u[3]; 949 950 ierr = PetscLogFlops(16*2);CHKERRQ(ierr); 951 952 /* update -U(i,k): ba[ili] = uik */ 953 ierr = PetscMemcpy(ba+ili*4,uik,4*sizeof(MatScalar));CHKERRQ(ierr); 954 955 /* add multiple of row i to k-th row ... */ 956 jmin = ili + 1; jmax = bi[i+1]; 957 if (jmin < jmax){ 958 for (j=jmin; j<jmax; j++) { 959 /* rtmp += -U(i,k)^T * U_bar(i,j): rtmp[bj[j]] += uik*ba[j]; */ 960 rtmp_ptr = rtmp + bj[j]*4; 961 u = ba + j*4; 962 rtmp_ptr[0] += uik[0]*u[0] + uik[1]*u[1]; 963 rtmp_ptr[1] += uik[2]*u[0] + uik[3]*u[1]; 964 rtmp_ptr[2] += uik[0]*u[2] + uik[1]*u[3]; 965 rtmp_ptr[3] += uik[2]*u[2] + uik[3]*u[3]; 966 } 967 ierr = PetscLogFlops(16*(jmax-jmin));CHKERRQ(ierr); 968 969 /* ... add i to row list for next nonzero entry */ 970 il[i] = jmin; /* update il(i) in column k+1, ... mbs-1 */ 971 j = bj[jmin]; 972 jl[i] = jl[j]; jl[j] = i; /* update jl */ 973 } 974 i = nexti; 975 } 976 977 /* save nonzero entries in k-th row of U ... */ 978 979 /* invert diagonal block */ 980 diag = ba+k*4; 981 ierr = PetscMemcpy(diag,dk,4*sizeof(MatScalar));CHKERRQ(ierr); 982 ierr = Kernel_A_gets_inverse_A_2(diag,shift);CHKERRQ(ierr); 983 984 jmin = bi[k]; jmax = bi[k+1]; 985 if (jmin < jmax) { 986 for (j=jmin; j<jmax; j++){ 987 vj = bj[j]; /* block col. index of U */ 988 u = ba + j*4; 989 rtmp_ptr = rtmp + vj*4; 990 for (k1=0; k1<4; k1++){ 991 *u++ = *rtmp_ptr; 992 *rtmp_ptr++ = 0.0; 993 } 994 } 995 996 /* ... add k to row list for first nonzero entry in k-th row */ 997 il[k] = jmin; 998 i = bj[jmin]; 999 jl[k] = jl[i]; jl[i] = k; 1000 } 1001 } 1002 1003 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1004 ierr = PetscFree(il);CHKERRQ(ierr); 1005 ierr = PetscFree(dk);CHKERRQ(ierr); 1006 1007 C->ops->solve = MatSolve_SeqSBAIJ_2_NaturalOrdering; 1008 C->ops->solvetranspose = MatSolve_SeqSBAIJ_2_NaturalOrdering; 1009 C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering; 1010 C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering; 1011 C->assembled = PETSC_TRUE; 1012 C->preallocated = PETSC_TRUE; 1013 ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 1014 PetscFunctionReturn(0); 1015 } 1016 1017 /* 1018 Numeric U^T*D*U factorization for SBAIJ format. 1019 Version for blocks are 1 by 1. 1020 */ 1021 #undef __FUNCT__ 1022 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1" 1023 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1(Mat C,Mat A,const MatFactorInfo *info) 1024 { 1025 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data; 1026 IS ip=b->row; 1027 PetscErrorCode ierr; 1028 const PetscInt *ai,*aj,*rip; 1029 PetscInt *a2anew,i,j,mbs=a->mbs,*bi=b->i,*bj=b->j,*bcol; 1030 PetscInt k,jmin,jmax,*jl,*il,col,nexti,ili,nz; 1031 MatScalar *rtmp,*ba=b->a,*bval,*aa,dk,uikdi; 1032 PetscReal zeropivot,rs,shiftnz; 1033 PetscReal shiftpd; 1034 ChShift_Ctx sctx; 1035 PetscInt newshift; 1036 1037 PetscFunctionBegin; 1038 /* initialization */ 1039 shiftnz = info->shiftnz; 1040 shiftpd = info->shiftpd; 1041 zeropivot = info->zeropivot; 1042 1043 ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 1044 if (!a->permute){ 1045 ai = a->i; aj = a->j; aa = a->a; 1046 } else { 1047 ai = a->inew; aj = a->jnew; 1048 nz = ai[mbs]; 1049 ierr = PetscMalloc(nz*sizeof(MatScalar),&aa);CHKERRQ(ierr); 1050 a2anew = a->a2anew; 1051 bval = a->a; 1052 for (j=0; j<nz; j++){ 1053 aa[a2anew[j]] = *(bval++); 1054 } 1055 } 1056 1057 /* initialization */ 1058 /* il and jl record the first nonzero element in each row of the accessing 1059 window U(0:k, k:mbs-1). 1060 jl: list of rows to be added to uneliminated rows 1061 i>= k: jl(i) is the first row to be added to row i 1062 i< k: jl(i) is the row following row i in some list of rows 1063 jl(i) = mbs indicates the end of a list 1064 il(i): points to the first nonzero element in columns k,...,mbs-1 of 1065 row i of U */ 1066 nz = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar); 1067 ierr = PetscMalloc(nz,&il);CHKERRQ(ierr); 1068 jl = il + mbs; 1069 rtmp = (MatScalar*)(jl + mbs); 1070 1071 sctx.shift_amount = 0; 1072 sctx.nshift = 0; 1073 do { 1074 sctx.chshift = PETSC_FALSE; 1075 for (i=0; i<mbs; i++) { 1076 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 1077 } 1078 1079 for (k = 0; k<mbs; k++){ 1080 /*initialize k-th row by the perm[k]-th row of A */ 1081 jmin = ai[rip[k]]; jmax = ai[rip[k]+1]; 1082 bval = ba + bi[k]; 1083 for (j = jmin; j < jmax; j++){ 1084 col = rip[aj[j]]; 1085 rtmp[col] = aa[j]; 1086 *bval++ = 0.0; /* for in-place factorization */ 1087 } 1088 1089 /* shift the diagonal of the matrix */ 1090 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 1091 1092 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 1093 dk = rtmp[k]; 1094 i = jl[k]; /* first row to be added to k_th row */ 1095 1096 while (i < k){ 1097 nexti = jl[i]; /* next row to be added to k_th row */ 1098 1099 /* compute multiplier, update diag(k) and U(i,k) */ 1100 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1101 uikdi = - ba[ili]*ba[bi[i]]; /* diagonal(k) */ 1102 dk += uikdi*ba[ili]; 1103 ba[ili] = uikdi; /* -U(i,k) */ 1104 1105 /* add multiple of row i to k-th row */ 1106 jmin = ili + 1; jmax = bi[i+1]; 1107 if (jmin < jmax){ 1108 for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 1109 ierr = PetscLogFlops(2*(jmax-jmin));CHKERRQ(ierr); 1110 1111 /* update il and jl for row i */ 1112 il[i] = jmin; 1113 j = bj[jmin]; jl[i] = jl[j]; jl[j] = i; 1114 } 1115 i = nexti; 1116 } 1117 1118 /* shift the diagonals when zero pivot is detected */ 1119 /* compute rs=sum of abs(off-diagonal) */ 1120 rs = 0.0; 1121 jmin = bi[k]+1; 1122 nz = bi[k+1] - jmin; 1123 if (nz){ 1124 bcol = bj + jmin; 1125 while (nz--){ 1126 rs += PetscAbsScalar(rtmp[*bcol]); 1127 bcol++; 1128 } 1129 } 1130 1131 sctx.rs = rs; 1132 sctx.pv = dk; 1133 ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr); 1134 if (newshift == 1) break; /* sctx.shift_amount is updated */ 1135 1136 /* copy data into U(k,:) */ 1137 ba[bi[k]] = 1.0/dk; /* U(k,k) */ 1138 jmin = bi[k]+1; jmax = bi[k+1]; 1139 if (jmin < jmax) { 1140 for (j=jmin; j<jmax; j++){ 1141 col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0; 1142 } 1143 /* add the k-th row into il and jl */ 1144 il[k] = jmin; 1145 i = bj[jmin]; jl[k] = jl[i]; jl[i] = k; 1146 } 1147 } 1148 } while (sctx.chshift); 1149 ierr = PetscFree(il);CHKERRQ(ierr); 1150 if (a->permute){ierr = PetscFree(aa);CHKERRQ(ierr);} 1151 1152 ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 1153 C->ops->solve = MatSolve_SeqSBAIJ_1; 1154 C->ops->solves = MatSolves_SeqSBAIJ_1; 1155 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1; 1156 C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1; 1157 C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1; 1158 C->assembled = PETSC_TRUE; 1159 C->preallocated = PETSC_TRUE; 1160 ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr); 1161 if (sctx.nshift){ 1162 if (shiftnz) { 1163 ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1164 } else if (shiftpd) { 1165 ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1166 } 1167 } 1168 PetscFunctionReturn(0); 1169 } 1170 1171 /* 1172 Version for when blocks are 1 by 1 Using natural ordering 1173 */ 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering" 1176 PetscErrorCode MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 1177 { 1178 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data,*b=(Mat_SeqSBAIJ *)C->data; 1179 PetscErrorCode ierr; 1180 PetscInt i,j,mbs = a->mbs; 1181 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 1182 PetscInt k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz; 1183 MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval; 1184 PetscReal zeropivot,rs,shiftnz; 1185 PetscReal shiftpd; 1186 ChShift_Ctx sctx; 1187 PetscInt newshift; 1188 1189 PetscFunctionBegin; 1190 /* initialization */ 1191 shiftnz = info->shiftnz; 1192 shiftpd = info->shiftpd; 1193 zeropivot = info->zeropivot; 1194 1195 /* il and jl record the first nonzero element in each row of the accessing 1196 window U(0:k, k:mbs-1). 1197 jl: list of rows to be added to uneliminated rows 1198 i>= k: jl(i) is the first row to be added to row i 1199 i< k: jl(i) is the row following row i in some list of rows 1200 jl(i) = mbs indicates the end of a list 1201 il(i): points to the first nonzero element in U(i,k:mbs-1) 1202 */ 1203 ierr = PetscMalloc(mbs*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1204 ierr = PetscMalloc(2*mbs*sizeof(PetscInt),&il);CHKERRQ(ierr); 1205 jl = il + mbs; 1206 1207 sctx.shift_amount = 0; 1208 sctx.nshift = 0; 1209 do { 1210 sctx.chshift = PETSC_FALSE; 1211 for (i=0; i<mbs; i++) { 1212 rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 1213 } 1214 1215 for (k = 0; k<mbs; k++){ 1216 /*initialize k-th row with elements nonzero in row perm(k) of A */ 1217 nz = ai[k+1] - ai[k]; 1218 acol = aj + ai[k]; 1219 aval = aa + ai[k]; 1220 bval = ba + bi[k]; 1221 while (nz -- ){ 1222 rtmp[*acol++] = *aval++; 1223 *bval++ = 0.0; /* for in-place factorization */ 1224 } 1225 1226 /* shift the diagonal of the matrix */ 1227 if (sctx.nshift) rtmp[k] += sctx.shift_amount; 1228 1229 /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 1230 dk = rtmp[k]; 1231 i = jl[k]; /* first row to be added to k_th row */ 1232 1233 while (i < k){ 1234 nexti = jl[i]; /* next row to be added to k_th row */ 1235 /* compute multiplier, update D(k) and U(i,k) */ 1236 ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 1237 uikdi = - ba[ili]*ba[bi[i]]; 1238 dk += uikdi*ba[ili]; 1239 ba[ili] = uikdi; /* -U(i,k) */ 1240 1241 /* add multiple of row i to k-th row ... */ 1242 jmin = ili + 1; 1243 nz = bi[i+1] - jmin; 1244 if (nz > 0){ 1245 bcol = bj + jmin; 1246 bval = ba + jmin; 1247 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1248 while (nz --) rtmp[*bcol++] += uikdi*(*bval++); 1249 1250 /* update il and jl for i-th row */ 1251 il[i] = jmin; 1252 j = bj[jmin]; jl[i] = jl[j]; jl[j] = i; 1253 } 1254 i = nexti; 1255 } 1256 1257 /* shift the diagonals when zero pivot is detected */ 1258 /* compute rs=sum of abs(off-diagonal) */ 1259 rs = 0.0; 1260 jmin = bi[k]+1; 1261 nz = bi[k+1] - jmin; 1262 if (nz){ 1263 bcol = bj + jmin; 1264 while (nz--){ 1265 rs += PetscAbsScalar(rtmp[*bcol]); 1266 bcol++; 1267 } 1268 } 1269 1270 sctx.rs = rs; 1271 sctx.pv = dk; 1272 ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr); 1273 if (newshift == 1) break; /* sctx.shift_amount is updated */ 1274 1275 /* copy data into U(k,:) */ 1276 ba[bi[k]] = 1.0/dk; 1277 jmin = bi[k]+1; 1278 nz = bi[k+1] - jmin; 1279 if (nz){ 1280 bcol = bj + jmin; 1281 bval = ba + jmin; 1282 while (nz--){ 1283 *bval++ = rtmp[*bcol]; 1284 rtmp[*bcol++] = 0.0; 1285 } 1286 /* add k-th row into il and jl */ 1287 il[k] = jmin; 1288 i = bj[jmin]; jl[k] = jl[i]; jl[i] = k; 1289 } 1290 } /* end of for (k = 0; k<mbs; k++) */ 1291 } while (sctx.chshift); 1292 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1293 ierr = PetscFree(il);CHKERRQ(ierr); 1294 1295 C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 1296 C->ops->solves = MatSolves_SeqSBAIJ_1; 1297 C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering; 1298 C->ops->forwardsolve = MatForwardSolve_SeqSBAIJ_1_NaturalOrdering; 1299 C->ops->backwardsolve = MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering; 1300 1301 C->assembled = PETSC_TRUE; 1302 C->preallocated = PETSC_TRUE; 1303 ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr); 1304 if (sctx.nshift){ 1305 if (shiftnz) { 1306 ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1307 } else if (shiftpd) { 1308 ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1309 } 1310 } 1311 PetscFunctionReturn(0); 1312 } 1313 1314 #undef __FUNCT__ 1315 #define __FUNCT__ "MatCholeskyFactor_SeqSBAIJ" 1316 PetscErrorCode MatCholeskyFactor_SeqSBAIJ(Mat A,IS perm,const MatFactorInfo *info) 1317 { 1318 PetscErrorCode ierr; 1319 Mat C; 1320 1321 PetscFunctionBegin; 1322 ierr = MatGetFactor(A,"petsc",MAT_FACTOR_CHOLESKY,&C);CHKERRQ(ierr); 1323 ierr = MatCholeskyFactorSymbolic(C,A,perm,info);CHKERRQ(ierr); 1324 ierr = MatCholeskyFactorNumeric(C,A,info);CHKERRQ(ierr); 1325 A->ops->solve = C->ops->solve; 1326 A->ops->solvetranspose = C->ops->solvetranspose; 1327 ierr = MatHeaderCopy(A,C);CHKERRQ(ierr); 1328 PetscFunctionReturn(0); 1329 } 1330 1331 1332