1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <petsc/private/kernels/blockinvert.h> 8 #include <petscbt.h> 9 #include <../src/mat/utils/freespace.h> 10 11 /* ----------------------------------------------------------------*/ 12 extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool); 13 14 /* 15 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 16 */ 17 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 18 { 19 Mat C =B; 20 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 21 PetscErrorCode ierr; 22 PetscInt i,j,k,ipvt[15]; 23 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 24 PetscInt nz,nzL,row; 25 MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 26 const MatScalar *v,*aa=a->a; 27 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 28 PetscInt sol_ver; 29 PetscBool allowzeropivot,zeropivotdetected; 30 31 PetscFunctionBegin; 32 allowzeropivot = PetscNot(A->erroriffailure); 33 ierr = PetscOptionsGetInt(NULL,((PetscObject)A)->prefix,"-sol_ver",&sol_ver,NULL);CHKERRQ(ierr); 34 35 /* generate work space needed by the factorization */ 36 ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); 37 ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); 38 39 for (i=0; i<n; i++) { 40 /* zero rtmp */ 41 /* L part */ 42 nz = bi[i+1] - bi[i]; 43 bjtmp = bj + bi[i]; 44 for (j=0; j<nz; j++) { 45 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 46 } 47 48 /* U part */ 49 nz = bdiag[i] - bdiag[i+1]; 50 bjtmp = bj + bdiag[i+1]+1; 51 for (j=0; j<nz; j++) { 52 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 53 } 54 55 /* load in initial (unfactored row) */ 56 nz = ai[i+1] - ai[i]; 57 ajtmp = aj + ai[i]; 58 v = aa + bs2*ai[i]; 59 for (j=0; j<nz; j++) { 60 ierr = PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);CHKERRQ(ierr); 61 } 62 63 /* elimination */ 64 bjtmp = bj + bi[i]; 65 nzL = bi[i+1] - bi[i]; 66 for (k=0; k < nzL; k++) { 67 row = bjtmp[k]; 68 pc = rtmp + bs2*row; 69 for (flg=0,j=0; j<bs2; j++) { 70 if (pc[j]!=0.0) { 71 flg = 1; 72 break; 73 } 74 } 75 if (flg) { 76 pv = b->a + bs2*bdiag[row]; 77 PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); 78 /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 79 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 80 pv = b->a + bs2*(bdiag[row+1]+1); 81 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 82 for (j=0; j<nz; j++) { 83 vv = rtmp + bs2*pj[j]; 84 PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 85 /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 86 pv += bs2; 87 } 88 ierr = PetscLogFlops(2.0*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 89 } 90 } 91 92 /* finished row so stick it into b->a */ 93 /* L part */ 94 pv = b->a + bs2*bi[i]; 95 pj = b->j + bi[i]; 96 nz = bi[i+1] - bi[i]; 97 for (j=0; j<nz; j++) { 98 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 99 } 100 101 /* Mark diagonal and invert diagonal for simpler triangular solves */ 102 pv = b->a + bs2*bdiag[i]; 103 pj = b->j + bdiag[i]; 104 ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); 105 ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 106 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 107 108 /* U part */ 109 pv = b->a + bs2*(bdiag[i+1]+1); 110 pj = b->j + bdiag[i+1]+1; 111 nz = bdiag[i] - bdiag[i+1] - 1; 112 for (j=0; j<nz; j++) { 113 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 114 } 115 } 116 117 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 118 119 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 120 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 121 C->assembled = PETSC_TRUE; 122 123 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 124 PetscFunctionReturn(0); 125 } 126 127 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 128 { 129 Mat C =B; 130 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 131 IS isrow = b->row,isicol = b->icol; 132 PetscErrorCode ierr; 133 const PetscInt *r,*ic; 134 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 135 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 136 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 137 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 138 MatScalar *v_work; 139 PetscBool col_identity,row_identity,both_identity; 140 PetscBool allowzeropivot,zeropivotdetected; 141 142 PetscFunctionBegin; 143 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 144 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 145 allowzeropivot = PetscNot(A->erroriffailure); 146 147 ierr = PetscCalloc1(bs2*n,&rtmp);CHKERRQ(ierr); 148 149 /* generate work space needed by dense LU factorization */ 150 ierr = PetscMalloc3(bs,&v_work,bs2,&mwork,bs,&v_pivots);CHKERRQ(ierr); 151 152 for (i=0; i<n; i++) { 153 /* zero rtmp */ 154 /* L part */ 155 nz = bi[i+1] - bi[i]; 156 bjtmp = bj + bi[i]; 157 for (j=0; j<nz; j++) { 158 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 159 } 160 161 /* U part */ 162 nz = bdiag[i] - bdiag[i+1]; 163 bjtmp = bj + bdiag[i+1]+1; 164 for (j=0; j<nz; j++) { 165 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 166 } 167 168 /* load in initial (unfactored row) */ 169 nz = ai[r[i]+1] - ai[r[i]]; 170 ajtmp = aj + ai[r[i]]; 171 v = aa + bs2*ai[r[i]]; 172 for (j=0; j<nz; j++) { 173 ierr = PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);CHKERRQ(ierr); 174 } 175 176 /* elimination */ 177 bjtmp = bj + bi[i]; 178 nzL = bi[i+1] - bi[i]; 179 for (k=0; k < nzL; k++) { 180 row = bjtmp[k]; 181 pc = rtmp + bs2*row; 182 for (flg=0,j=0; j<bs2; j++) { 183 if (pc[j]!=0.0) { 184 flg = 1; 185 break; 186 } 187 } 188 if (flg) { 189 pv = b->a + bs2*bdiag[row]; 190 PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 191 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 192 pv = b->a + bs2*(bdiag[row+1]+1); 193 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 194 for (j=0; j<nz; j++) { 195 PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 196 } 197 ierr = PetscLogFlops(2.0*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 198 } 199 } 200 201 /* finished row so stick it into b->a */ 202 /* L part */ 203 pv = b->a + bs2*bi[i]; 204 pj = b->j + bi[i]; 205 nz = bi[i+1] - bi[i]; 206 for (j=0; j<nz; j++) { 207 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 208 } 209 210 /* Mark diagonal and invert diagonal for simpler triangular solves */ 211 pv = b->a + bs2*bdiag[i]; 212 pj = b->j + bdiag[i]; 213 ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); 214 215 ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 216 if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 217 218 /* U part */ 219 pv = b->a + bs2*(bdiag[i+1]+1); 220 pj = b->j + bdiag[i+1]+1; 221 nz = bdiag[i] - bdiag[i+1] - 1; 222 for (j=0; j<nz; j++) { 223 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 224 } 225 } 226 227 ierr = PetscFree(rtmp);CHKERRQ(ierr); 228 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 229 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 230 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 231 232 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 233 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 234 235 both_identity = (PetscBool) (row_identity && col_identity); 236 if (both_identity) { 237 switch (bs) { 238 case 9: 239 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES) 240 C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering; 241 #else 242 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 243 #endif 244 break; 245 case 11: 246 C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering; 247 break; 248 case 12: 249 C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering; 250 break; 251 case 13: 252 C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering; 253 break; 254 case 14: 255 C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering; 256 break; 257 default: 258 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 259 break; 260 } 261 } else { 262 C->ops->solve = MatSolve_SeqBAIJ_N; 263 } 264 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 265 266 C->assembled = PETSC_TRUE; 267 268 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 269 PetscFunctionReturn(0); 270 } 271 272 /* 273 ilu(0) with natural ordering under new data structure. 274 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 275 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 276 */ 277 278 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 279 { 280 281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 282 PetscErrorCode ierr; 283 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 284 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 285 286 PetscFunctionBegin; 287 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 288 b = (Mat_SeqBAIJ*)(fact)->data; 289 290 /* allocate matrix arrays for new data structure */ 291 ierr = PetscMalloc3(bs2*ai[n]+1,&b->a,ai[n]+1,&b->j,n+1,&b->i);CHKERRQ(ierr); 292 ierr = PetscLogObjectMemory((PetscObject)fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 293 294 b->singlemalloc = PETSC_TRUE; 295 b->free_a = PETSC_TRUE; 296 b->free_ij = PETSC_TRUE; 297 fact->preallocated = PETSC_TRUE; 298 fact->assembled = PETSC_TRUE; 299 if (!b->diag) { 300 ierr = PetscMalloc1(n+1,&b->diag);CHKERRQ(ierr); 301 ierr = PetscLogObjectMemory((PetscObject)fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 302 } 303 bdiag = b->diag; 304 305 if (n > 0) { 306 ierr = PetscArrayzero(b->a,bs2*ai[n]);CHKERRQ(ierr); 307 } 308 309 /* set bi and bj with new data structure */ 310 bi = b->i; 311 bj = b->j; 312 313 /* L part */ 314 bi[0] = 0; 315 for (i=0; i<n; i++) { 316 nz = adiag[i] - ai[i]; 317 bi[i+1] = bi[i] + nz; 318 aj = a->j + ai[i]; 319 for (j=0; j<nz; j++) { 320 *bj = aj[j]; bj++; 321 } 322 } 323 324 /* U part */ 325 bi_temp = bi[n]; 326 bdiag[n] = bi[n]-1; 327 for (i=n-1; i>=0; i--) { 328 nz = ai[i+1] - adiag[i] - 1; 329 bi_temp = bi_temp + nz + 1; 330 aj = a->j + adiag[i] + 1; 331 for (j=0; j<nz; j++) { 332 *bj = aj[j]; bj++; 333 } 334 /* diag[i] */ 335 *bj = i; bj++; 336 bdiag[i] = bi_temp - 1; 337 } 338 PetscFunctionReturn(0); 339 } 340 341 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 342 { 343 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 344 IS isicol; 345 PetscErrorCode ierr; 346 const PetscInt *r,*ic; 347 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 348 PetscInt *bi,*cols,nnz,*cols_lvl; 349 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 350 PetscInt i,levels,diagonal_fill; 351 PetscBool col_identity,row_identity,both_identity; 352 PetscReal f; 353 PetscInt nlnk,*lnk,*lnk_lvl=NULL; 354 PetscBT lnkbt; 355 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 356 PetscFreeSpaceList free_space =NULL,current_space=NULL; 357 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 358 PetscBool missing; 359 PetscInt bs=A->rmap->bs,bs2=a->bs2; 360 361 PetscFunctionBegin; 362 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 363 if (bs>1) { /* check shifttype */ 364 if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 365 } 366 367 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 368 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 369 370 f = info->fill; 371 levels = (PetscInt)info->levels; 372 diagonal_fill = (PetscInt)info->diagonal_fill; 373 374 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 375 376 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 377 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 378 379 both_identity = (PetscBool) (row_identity && col_identity); 380 381 if (!levels && both_identity) { 382 /* special case: ilu(0) with natural ordering */ 383 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 384 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 385 386 fact->factortype = MAT_FACTOR_ILU; 387 (fact)->info.factor_mallocs = 0; 388 (fact)->info.fill_ratio_given = info->fill; 389 (fact)->info.fill_ratio_needed = 1.0; 390 391 b = (Mat_SeqBAIJ*)(fact)->data; 392 b->row = isrow; 393 b->col = iscol; 394 b->icol = isicol; 395 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 396 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 397 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 398 399 ierr = PetscMalloc1((n+1)*bs,&b->solve_work);CHKERRQ(ierr); 400 PetscFunctionReturn(0); 401 } 402 403 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 404 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 405 406 /* get new row pointers */ 407 ierr = PetscMalloc1(n+1,&bi);CHKERRQ(ierr); 408 bi[0] = 0; 409 /* bdiag is location of diagonal in factor */ 410 ierr = PetscMalloc1(n+1,&bdiag);CHKERRQ(ierr); 411 bdiag[0] = 0; 412 413 ierr = PetscMalloc2(n,&bj_ptr,n,&bjlvl_ptr);CHKERRQ(ierr); 414 415 /* create a linked list for storing column indices of the active row */ 416 nlnk = n + 1; 417 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 418 419 /* initial FreeSpace size is f*(ai[n]+1) */ 420 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(f,ai[n]+1),&free_space);CHKERRQ(ierr); 421 current_space = free_space; 422 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(f,ai[n]+1),&free_space_lvl);CHKERRQ(ierr); 423 current_space_lvl = free_space_lvl; 424 425 for (i=0; i<n; i++) { 426 nzi = 0; 427 /* copy current row into linked list */ 428 nnz = ai[r[i]+1] - ai[r[i]]; 429 if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 430 cols = aj + ai[r[i]]; 431 lnk[i] = -1; /* marker to indicate if diagonal exists */ 432 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 433 nzi += nlnk; 434 435 /* make sure diagonal entry is included */ 436 if (diagonal_fill && lnk[i] == -1) { 437 fm = n; 438 while (lnk[fm] < i) fm = lnk[fm]; 439 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 440 lnk[fm] = i; 441 lnk_lvl[i] = 0; 442 nzi++; dcount++; 443 } 444 445 /* add pivot rows into the active row */ 446 nzbd = 0; 447 prow = lnk[n]; 448 while (prow < i) { 449 nnz = bdiag[prow]; 450 cols = bj_ptr[prow] + nnz + 1; 451 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 452 nnz = bi[prow+1] - bi[prow] - nnz - 1; 453 454 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 455 nzi += nlnk; 456 prow = lnk[prow]; 457 nzbd++; 458 } 459 bdiag[i] = nzbd; 460 bi[i+1] = bi[i] + nzi; 461 462 /* if free space is not available, make more free space */ 463 if (current_space->local_remaining<nzi) { 464 nnz = PetscIntMultTruncate(2,PetscIntMultTruncate(nzi,(n - i))); /* estimated and max additional space needed */ 465 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 466 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 467 reallocs++; 468 } 469 470 /* copy data into free_space and free_space_lvl, then initialize lnk */ 471 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 472 473 bj_ptr[i] = current_space->array; 474 bjlvl_ptr[i] = current_space_lvl->array; 475 476 /* make sure the active row i has diagonal entry */ 477 if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 478 479 current_space->array += nzi; 480 current_space->local_used += nzi; 481 current_space->local_remaining -= nzi; 482 483 current_space_lvl->array += nzi; 484 current_space_lvl->local_used += nzi; 485 current_space_lvl->local_remaining -= nzi; 486 } 487 488 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 489 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 490 491 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 492 ierr = PetscMalloc1(bi[n]+1,&bj);CHKERRQ(ierr); 493 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 494 495 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 496 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 497 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 498 499 #if defined(PETSC_USE_INFO) 500 { 501 PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 502 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr); 503 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 504 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%g);\n",(double)af);CHKERRQ(ierr); 505 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 506 if (diagonal_fill) { 507 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 508 } 509 } 510 #endif 511 512 /* put together the new matrix */ 513 ierr = MatSeqBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 514 ierr = PetscLogObjectParent((PetscObject)fact,(PetscObject)isicol);CHKERRQ(ierr); 515 516 b = (Mat_SeqBAIJ*)(fact)->data; 517 b->free_a = PETSC_TRUE; 518 b->free_ij = PETSC_TRUE; 519 b->singlemalloc = PETSC_FALSE; 520 521 ierr = PetscMalloc1(bs2*(bdiag[0]+1),&b->a);CHKERRQ(ierr); 522 523 b->j = bj; 524 b->i = bi; 525 b->diag = bdiag; 526 b->free_diag = PETSC_TRUE; 527 b->ilen = NULL; 528 b->imax = NULL; 529 b->row = isrow; 530 b->col = iscol; 531 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 532 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 533 b->icol = isicol; 534 535 ierr = PetscMalloc1(bs*n+bs,&b->solve_work);CHKERRQ(ierr); 536 /* In b structure: Free imax, ilen, old a, old j. 537 Allocate bdiag, solve_work, new a, new j */ 538 ierr = PetscLogObjectMemory((PetscObject)fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 539 b->maxnz = b->nz = bdiag[0]+1; 540 541 fact->info.factor_mallocs = reallocs; 542 fact->info.fill_ratio_given = f; 543 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 544 545 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 546 PetscFunctionReturn(0); 547 } 548 549 /* 550 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 551 except that the data structure of Mat_SeqAIJ is slightly different. 552 Not a good example of code reuse. 553 */ 554 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 555 { 556 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 557 IS isicol; 558 PetscErrorCode ierr; 559 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 560 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 561 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 562 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 563 PetscBool col_identity,row_identity,both_identity,flg; 564 PetscReal f; 565 566 PetscFunctionBegin; 567 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 568 if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 569 570 f = info->fill; 571 levels = (PetscInt)info->levels; 572 diagonal_fill = (PetscInt)info->diagonal_fill; 573 574 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 575 576 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 577 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 578 both_identity = (PetscBool) (row_identity && col_identity); 579 580 if (!levels && both_identity) { /* special case copy the nonzero structure */ 581 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 582 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 583 584 fact->factortype = MAT_FACTOR_ILU; 585 b = (Mat_SeqBAIJ*)fact->data; 586 b->row = isrow; 587 b->col = iscol; 588 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 589 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 590 b->icol = isicol; 591 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 592 593 ierr = PetscMalloc1((n+1)*bs,&b->solve_work);CHKERRQ(ierr); 594 PetscFunctionReturn(0); 595 } 596 597 /* general case perform the symbolic factorization */ 598 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 599 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 600 601 /* get new row pointers */ 602 ierr = PetscMalloc1(n+1,&ainew);CHKERRQ(ierr); 603 ainew[0] = 0; 604 /* don't know how many column pointers are needed so estimate */ 605 jmax = (PetscInt)(f*ai[n] + 1); 606 ierr = PetscMalloc1(jmax,&ajnew);CHKERRQ(ierr); 607 /* ajfill is level of fill for each fill entry */ 608 ierr = PetscMalloc1(jmax,&ajfill);CHKERRQ(ierr); 609 /* fill is a linked list of nonzeros in active row */ 610 ierr = PetscMalloc1(n+1,&fill);CHKERRQ(ierr); 611 /* im is level for each filled value */ 612 ierr = PetscMalloc1(n+1,&im);CHKERRQ(ierr); 613 /* dloc is location of diagonal in factor */ 614 ierr = PetscMalloc1(n+1,&dloc);CHKERRQ(ierr); 615 dloc[0] = 0; 616 for (prow=0; prow<n; prow++) { 617 618 /* copy prow into linked list */ 619 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 620 if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 621 xi = aj + ai[r[prow]]; 622 fill[n] = n; 623 fill[prow] = -1; /* marker for diagonal entry */ 624 while (nz--) { 625 fm = n; 626 idx = ic[*xi++]; 627 do { 628 m = fm; 629 fm = fill[m]; 630 } while (fm < idx); 631 fill[m] = idx; 632 fill[idx] = fm; 633 im[idx] = 0; 634 } 635 636 /* make sure diagonal entry is included */ 637 if (diagonal_fill && fill[prow] == -1) { 638 fm = n; 639 while (fill[fm] < prow) fm = fill[fm]; 640 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 641 fill[fm] = prow; 642 im[prow] = 0; 643 nzf++; 644 dcount++; 645 } 646 647 nzi = 0; 648 row = fill[n]; 649 while (row < prow) { 650 incrlev = im[row] + 1; 651 nz = dloc[row]; 652 xi = ajnew + ainew[row] + nz + 1; 653 flev = ajfill + ainew[row] + nz + 1; 654 nnz = ainew[row+1] - ainew[row] - nz - 1; 655 fm = row; 656 while (nnz-- > 0) { 657 idx = *xi++; 658 if (*flev + incrlev > levels) { 659 flev++; 660 continue; 661 } 662 do { 663 m = fm; 664 fm = fill[m]; 665 } while (fm < idx); 666 if (fm != idx) { 667 im[idx] = *flev + incrlev; 668 fill[m] = idx; 669 fill[idx] = fm; 670 fm = idx; 671 nzf++; 672 } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 673 flev++; 674 } 675 row = fill[row]; 676 nzi++; 677 } 678 /* copy new filled row into permanent storage */ 679 ainew[prow+1] = ainew[prow] + nzf; 680 if (ainew[prow+1] > jmax) { 681 682 /* estimate how much additional space we will need */ 683 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 684 /* just double the memory each time */ 685 PetscInt maxadd = jmax; 686 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 687 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 688 jmax += maxadd; 689 690 /* allocate a longer ajnew and ajfill */ 691 ierr = PetscMalloc1(jmax,&xitmp);CHKERRQ(ierr); 692 ierr = PetscArraycpy(xitmp,ajnew,ainew[prow]);CHKERRQ(ierr); 693 ierr = PetscFree(ajnew);CHKERRQ(ierr); 694 ajnew = xitmp; 695 ierr = PetscMalloc1(jmax,&xitmp);CHKERRQ(ierr); 696 ierr = PetscArraycpy(xitmp,ajfill,ainew[prow]);CHKERRQ(ierr); 697 ierr = PetscFree(ajfill);CHKERRQ(ierr); 698 ajfill = xitmp; 699 reallocate++; /* count how many reallocations are needed */ 700 } 701 xitmp = ajnew + ainew[prow]; 702 flev = ajfill + ainew[prow]; 703 dloc[prow] = nzi; 704 fm = fill[n]; 705 while (nzf--) { 706 *xitmp++ = fm; 707 *flev++ = im[fm]; 708 fm = fill[fm]; 709 } 710 /* make sure row has diagonal entry */ 711 if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 712 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 713 } 714 ierr = PetscFree(ajfill);CHKERRQ(ierr); 715 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 716 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 717 ierr = PetscFree(fill);CHKERRQ(ierr); 718 ierr = PetscFree(im);CHKERRQ(ierr); 719 720 #if defined(PETSC_USE_INFO) 721 { 722 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 723 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocate,(double)f,(double)af);CHKERRQ(ierr); 724 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 725 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr); 726 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 727 if (diagonal_fill) { 728 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 729 } 730 } 731 #endif 732 733 /* put together the new matrix */ 734 ierr = MatSeqBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 735 ierr = PetscLogObjectParent((PetscObject)fact,(PetscObject)isicol);CHKERRQ(ierr); 736 b = (Mat_SeqBAIJ*)fact->data; 737 738 b->free_a = PETSC_TRUE; 739 b->free_ij = PETSC_TRUE; 740 b->singlemalloc = PETSC_FALSE; 741 742 ierr = PetscMalloc1(bs2*ainew[n],&b->a);CHKERRQ(ierr); 743 744 b->j = ajnew; 745 b->i = ainew; 746 for (i=0; i<n; i++) dloc[i] += ainew[i]; 747 b->diag = dloc; 748 b->free_diag = PETSC_TRUE; 749 b->ilen = NULL; 750 b->imax = NULL; 751 b->row = isrow; 752 b->col = iscol; 753 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 754 755 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 756 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 757 b->icol = isicol; 758 ierr = PetscMalloc1(bs*n+bs,&b->solve_work);CHKERRQ(ierr); 759 /* In b structure: Free imax, ilen, old a, old j. 760 Allocate dloc, solve_work, new a, new j */ 761 ierr = PetscLogObjectMemory((PetscObject)fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 762 b->maxnz = b->nz = ainew[n]; 763 764 fact->info.factor_mallocs = reallocate; 765 fact->info.fill_ratio_given = f; 766 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 767 768 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 769 PetscFunctionReturn(0); 770 } 771 772 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 773 { 774 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */ 775 /* int i,*AJ=a->j,nz=a->nz; */ 776 777 PetscFunctionBegin; 778 /* Undo Column scaling */ 779 /* while (nz--) { */ 780 /* AJ[i] = AJ[i]/4; */ 781 /* } */ 782 /* This should really invoke a push/pop logic, but we don't have that yet. */ 783 A->ops->setunfactored = NULL; 784 PetscFunctionReturn(0); 785 } 786 787 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 788 { 789 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 790 PetscInt *AJ=a->j,nz=a->nz; 791 unsigned short *aj=(unsigned short*)AJ; 792 793 PetscFunctionBegin; 794 /* Is this really necessary? */ 795 while (nz--) { 796 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 797 } 798 A->ops->setunfactored = NULL; 799 PetscFunctionReturn(0); 800 } 801 802