/* This file provides high performance routines for the Inode format (compressed sparse row) by taking advantage of rows with identical nonzero structure (I-nodes). */ #include <../src/mat/impls/aij/seq/aij.h> static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; PetscFunctionBegin; n = A->cmap->n; m = A->rmap->n; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); ns_row = a->inode.size; min_mn = (m < n) ? m : n; if (!ns) { for (count=0,i=0; count n) { /* Adjust for the over estimation */ ns_col[i-1] += n - count; } *size = i; *ns = ns_col; PetscFunctionReturn(0); } /* This builds symmetric version of nonzero structure, */ static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; PetscFunctionBegin; nslim_row = a->inode.node_count; m = A->rmap->n; n = A->cmap->n; if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); /* Use the row_inode as column_inode */ nslim_col = nslim_row; ns_col = ns_row; /* allocate space for reformated inode structure */ ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr); for (i1=0,tns[0]=0; i1data; PetscErrorCode ierr; PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; PetscInt *tns,*tvc,nsz,i1,i2; const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); nslim_row = a->inode.node_count; n = A->cmap->n; /* Create The column_inode for this matrix */ ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); /* allocate space for reformated column_inode structure */ ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr); for (i1=0,tns[0]=0; i1 0) { /* off-diagonal elemets */ ia[i1+1]++; i2++; /* Start col of next node */ while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; if (nz > 0) i2 = tvc[col]; } } /* shift ia[i] to point to next row */ for (i1=1; i1 0) { ja[work[i1]++] = i2 + oshift; ++i2; while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; if (nz > 0) i2 = tvc[col]; } } ierr = PetscFree(ns_col);CHKERRQ(ierr); ierr = PetscFree(work);CHKERRQ(ierr); ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); PetscFunctionReturn(0); } static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscFunctionBegin; *n = a->inode.node_count; if (!ia) PetscFunctionReturn(0); if (!blockcompressed) { ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); } else if (symmetric) { ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } else { ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } PetscFunctionReturn(0); } static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) { PetscErrorCode ierr; PetscFunctionBegin; if (!ia) PetscFunctionReturn(0); if (!blockcompressed) { ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); } else { ierr = PetscFree(*ia);CHKERRQ(ierr); ierr = PetscFree(*ja);CHKERRQ(ierr); } PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); nslim_row = a->inode.node_count; n = A->cmap->n; /* Create The column_inode for this matrix */ ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); /* allocate space for reformated column_inode structure */ ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr); for (i1=0,tns[0]=0; i1 0) { /* off-diagonal elemets */ /* ia[i1+1]++; */ ia[i2+1]++; i2++; while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; if (nz > 0) i2 = tvc[col]; } } /* shift ia[i] to point to next col */ for (i1=1; i1 0) { /* ja[work[i1]++] = i2 + oshift; */ ja[work[i2]++] = i1 + oshift; i2++; while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; if (nz > 0) i2 = tvc[col]; } } ierr = PetscFree(ns_col);CHKERRQ(ierr); ierr = PetscFree(work);CHKERRQ(ierr); ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); PetscFunctionReturn(0); } static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) { PetscErrorCode ierr; PetscFunctionBegin; ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr); if (!ia) PetscFunctionReturn(0); if (!blockcompressed) { ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); } else if (symmetric) { /* Since the indices are symmetric it doesn't matter */ ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } else { ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } PetscFunctionReturn(0); } static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) { PetscErrorCode ierr; PetscFunctionBegin; if (!ia) PetscFunctionReturn(0); if (!blockcompressed) { ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); } else { ierr = PetscFree(*ia);CHKERRQ(ierr); ierr = PetscFree(*ja);CHKERRQ(ierr); } PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; PetscScalar *y; const PetscScalar *x; const MatScalar *v1,*v2,*v3,*v4,*v5; PetscErrorCode ierr; PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; const PetscInt *idx,*ns,*ii; #if defined(PETSC_HAVE_PRAGMA_DISJOINT) #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) #endif PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); node_max = a->inode.node_count; ns = a->inode.size; /* Node Size array */ ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); ierr = VecGetArray(yy,&y);CHKERRQ(ierr); idx = a->j; v1 = a->a; ii = a->i; for (i = 0,row = 0; i< node_max; ++i) { nsz = ns[i]; n = ii[1] - ii[0]; nonzerorow += (n>0)*nsz; ii += nsz; PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ sz = n; /* No of non zeros in this row */ /* Switch on the size of Node */ switch (nsz) { /* Each loop in 'case' is unrolled */ case 1: sum1 = 0.; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; /* The instructions are ordered to */ i2 = idx[1]; /* make the compiler's job easy */ idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; } if (n == sz-1) { /* Take care of the last nonzero */ tmp0 = x[*idx++]; sum1 += *v1++ *tmp0; } y[row++]=sum1; break; case 2: sum1 = 0.; sum2 = 0.; v2 = v1 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; } if (n == sz-1) { tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; } y[row++]=sum1; y[row++]=sum2; v1 =v2; /* Since the next block to be processed starts there*/ idx +=sz; break; case 3: sum1 = 0.; sum2 = 0.; sum3 = 0.; v2 = v1 + n; v3 = v2 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; } if (n == sz-1) { tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; v1 =v3; /* Since the next block to be processed starts there*/ idx +=2*sz; break; case 4: sum1 = 0.; sum2 = 0.; sum3 = 0.; sum4 = 0.; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; } if (n == sz-1) { tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; sum4 += *v4++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; y[row++]=sum4; v1 =v4; /* Since the next block to be processed starts there*/ idx +=3*sz; break; case 5: sum1 = 0.; sum2 = 0.; sum3 = 0.; sum4 = 0.; sum5 = 0.; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; v5 = v4 + n; for (n = 0; nnz - nonzerorow);CHKERRQ(ierr); PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ /* Almost same code as the MatMult_SeqAIJ_Inode() */ PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; const MatScalar *v1,*v2,*v3,*v4,*v5; const PetscScalar *x; PetscScalar *y,*z,*zt; PetscErrorCode ierr; PetscInt i1,i2,n,i,row,node_max,nsz,sz; const PetscInt *idx,*ns,*ii; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); node_max = a->inode.node_count; ns = a->inode.size; /* Node Size array */ ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); zt = z; idx = a->j; v1 = a->a; ii = a->i; for (i = 0,row = 0; i< node_max; ++i) { nsz = ns[i]; n = ii[1] - ii[0]; ii += nsz; sz = n; /* No of non zeros in this row */ /* Switch on the size of Node */ switch (nsz) { /* Each loop in 'case' is unrolled */ case 1: sum1 = *zt++; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; /* The instructions are ordered to */ i2 = idx[1]; /* make the compiler's job easy */ idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; } if (n == sz-1) { /* Take care of the last nonzero */ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; } y[row++]=sum1; break; case 2: sum1 = *zt++; sum2 = *zt++; v2 = v1 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; } if (n == sz-1) { tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; } y[row++]=sum1; y[row++]=sum2; v1 =v2; /* Since the next block to be processed starts there*/ idx +=sz; break; case 3: sum1 = *zt++; sum2 = *zt++; sum3 = *zt++; v2 = v1 + n; v3 = v2 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; } if (n == sz-1) { tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; v1 =v3; /* Since the next block to be processed starts there*/ idx +=2*sz; break; case 4: sum1 = *zt++; sum2 = *zt++; sum3 = *zt++; sum4 = *zt++; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; } if (n == sz-1) { tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; sum4 += *v4++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; y[row++]=sum4; v1 =v4; /* Since the next block to be processed starts there*/ idx +=3*sz; break; case 5: sum1 = *zt++; sum2 = *zt++; sum3 = *zt++; sum4 = *zt++; sum5 = *zt++; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; v5 = v4 + n; for (n = 0; nnz);CHKERRQ(ierr); PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; IS iscol = a->col,isrow = a->row; PetscErrorCode ierr; const PetscInt *r,*c,*rout,*cout; PetscInt i,j,n = A->rmap->n,nz; PetscInt node_max,*ns,row,nsz,aii,i0,i1; const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; PetscScalar *x,*tmp,*tmps,tmp0,tmp1; PetscScalar sum1,sum2,sum3,sum4,sum5; const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; const PetscScalar *b; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); node_max = a->inode.node_count; ns = a->inode.size; /* Node Size array */ ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr); tmp = a->solve_work; ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); /* forward solve the lower triangular */ tmps = tmp; aa = a_a; aj = a_j; ad = a->diag; for (i = 0,row = 0; i< node_max; ++i) { nsz = ns[i]; aii = ai[row]; v1 = aa + aii; vi = aj + aii; nz = ad[row]- aii; if (i < node_max-1) { /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, * but our indexing to determine it's size could. */ PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* for (j=0; j=0; i--) { nsz = ns[i]; aii = ai[row+1] -1; v1 = aa + aii; vi = aj + aii; nz = aii- ad[row]; switch (nsz) { /* Each loop in 'case' is unrolled */ case 1: sum1 = tmp[row]; for (j=nz; j>1; j-=2) { vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; } if (j==1) { tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; } x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; break; case 2: sum1 = tmp[row]; sum2 = tmp[row -1]; v2 = aa + ai[row]-1; for (j=nz; j>1; j-=2) { vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; } if (j==1) { tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; break; case 3: sum1 = tmp[row]; sum2 = tmp[row -1]; sum3 = tmp[row -2]; v2 = aa + ai[row]-1; v3 = aa + ai[row -1]-1; for (j=nz; j>1; j-=2) { vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; v3 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; sum3 -= v3[2] * tmp0 + v3[1] * tmp1; } if (j==1) { tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; sum3 -= *v3-- * tmp0; x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; break; case 4: sum1 = tmp[row]; sum2 = tmp[row -1]; sum3 = tmp[row -2]; sum4 = tmp[row -3]; v2 = aa + ai[row]-1; v3 = aa + ai[row -1]-1; v4 = aa + ai[row -2]-1; for (j=nz; j>1; j-=2) { vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; v3 -= 2; v4 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; sum3 -= v3[2] * tmp0 + v3[1] * tmp1; sum4 -= v4[2] * tmp0 + v4[1] * tmp1; } if (j==1) { tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; sum4 -= *v4-- * tmp0; x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; break; case 5: sum1 = tmp[row]; sum2 = tmp[row -1]; sum3 = tmp[row -2]; sum4 = tmp[row -3]; sum5 = tmp[row -4]; v2 = aa + ai[row]-1; v3 = aa + ai[row -1]-1; v4 = aa + ai[row -2]-1; v5 = aa + ai[row -3]-1; for (j=nz; j>1; j-=2) { vi -= 2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; v3 -= 2; v4 -= 2; v5 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; sum3 -= v3[2] * tmp0 + v3[1] * tmp1; sum4 -= v4[2] * tmp0 + v4[1] * tmp1; sum5 -= v5[2] * tmp0 + v5[1] * tmp1; } if (j==1) { tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; sum5 -= *v5-- * tmp0; x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; break; default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); } } ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr); ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); PetscFunctionReturn(0); } PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) { Mat C =B; Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; IS isrow = b->row,isicol = b->icol; PetscErrorCode ierr; const PetscInt *r,*ic,*ics; const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; PetscInt i,j,k,nz,nzL,row,*pj; const PetscInt *ajtmp,*bjtmp; MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; FactorShiftCtx sctx; const PetscInt *ddiag; PetscReal rs; MatScalar d; PetscInt inod,nodesz,node_max,col; const PetscInt *ns; PetscInt *tmp_vec1,*tmp_vec2,*nsmap; PetscFunctionBegin; /* MatPivotSetUp(): initialize shift context sctx */ ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ ddiag = a->diag; sctx.shift_top = info->zeropivot; for (i=0; isctx.shift_top) sctx.shift_top = rs; } sctx.shift_top *= 1.1; sctx.nshift_max = 5; sctx.shift_lo = 0.; sctx.shift_hi = 1.; } ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); ics = ic; node_max = a->inode.node_count; ns = a->inode.size; if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); /* If max inode size > 4, split it into two inodes.*/ /* also map the inode sizes according to the ordering */ ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); for (i=0,j=0; i 4) { tmp_vec1[j] = 4; ++j; tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; } else { tmp_vec1[j] = ns[i]; } } /* Use the correct node_max */ node_max = j; /* Now reorder the inode info based on mat re-ordering info */ /* First create a row -> inode_size_array_index map */ ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); for (i=0,row=0; ia + bdiag[row]; mul1 = *pc * (*pv); *pc = mul1; pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ pv = b->a + bdiag[row+1]+1; nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pv = b->a + bi[i]; pj = b->j + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; ja + bdiag[i+1]+1; pj = b->j + bdiag[i+1]+1; nz = bdiag[i] - bdiag[i+1]-1; for (j=0; ja + bdiag[i]; *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ break; case 2: /*----------*/ /* zero rtmp1 and rtmp2 */ /* L part */ nz = bi[i+1] - bi[i]; bjtmp = bj + bi[i]; for (j=0; ja + bdiag[row]; mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); *pc1 = mul1; *pc2 = mul2; pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ pv = b->a + bdiag[row+1]+1; nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc1 = b->a + bi[i]; pj = b->j + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; ja + bdiag[i+1]+1; pj = b->j + bdiag[i+1]+1; nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i]; /* Mark diagonal */ *pc1 = 1.0/sctx.pv; /* Now take care of diagonal 2x2 block. */ pc2 = rtmp2 + i; if (*pc2 != 0.0) { mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ *pc2 = mul1; /* insert L entry */ pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc2 = b->a + bi[i+1]; pj = b->j + bi[i+1]; nz = bi[i+2] - bi[i+1]; for (j=0; ja + bdiag[i+2]+1; pj = b->j + bdiag[i+2]+1; nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i+1]; *pc2 = 1.0/sctx.pv; break; case 3: /*----------*/ /* zero rtmp */ /* L part */ nz = bi[i+1] - bi[i]; bjtmp = bj + bi[i]; for (j=0; ja + bdiag[row]; mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ pv = b->a + bdiag[row+1]+1; nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc1 = b->a + bi[i]; pj = b->j + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; ja + bdiag[i+1]+1; pj = b->j + bdiag[i+1]+1; nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i]; /* Mark diag[i] */ *pc1 = 1.0/sctx.pv; /* Now take care of 1st column of diagonal 3x3 block. */ pc2 = rtmp2 + i; pc3 = rtmp3 + i; if (*pc2 != 0.0 || *pc3 != 0.0) { mul2 = (*pc2)*(*pc1); *pc2 = mul2; mul3 = (*pc3)*(*pc1); *pc3 = mul3; pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc2 = b->a + bi[i+1]; pj = b->j + bi[i+1]; nz = bi[i+2] - bi[i+1]; for (j=0; ja + bdiag[i+2]+1; pj = b->j + bdiag[i+2]+1; nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i+1]; *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ /* Now take care of 2nd column of diagonal 3x3 block. */ pc3 = rtmp3 + i+1; if (*pc3 != 0.0) { mul3 = (*pc3)*(*pc2); *pc3 = mul3; pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc3 = b->a + bi[i+2]; pj = b->j + bi[i+2]; nz = bi[i+3] - bi[i+2]; for (j=0; ja + bdiag[i+3]+1; pj = b->j + bdiag[i+3]+1; nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i+2]; *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ break; case 4: /*----------*/ /* zero rtmp */ /* L part */ nz = bi[i+1] - bi[i]; bjtmp = bj + bi[i]; for (j=0; ja + bdiag[row]; mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ pv = b->a + bdiag[row+1]+1; nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc1 = b->a + bi[i]; pj = b->j + bi[i]; nz = bi[i+1] - bi[i]; for (j=0; ja + bdiag[i+1]+1; pj = b->j + bdiag[i+1]+1; nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i]; /* Mark diag[i] */ *pc1 = 1.0/sctx.pv; /* Now take care of 1st column of diagonal 4x4 block. */ pc2 = rtmp2 + i; pc3 = rtmp3 + i; pc4 = rtmp4 + i; if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { mul2 = (*pc2)*(*pc1); *pc2 = mul2; mul3 = (*pc3)*(*pc1); *pc3 = mul3; mul4 = (*pc4)*(*pc1); *pc4 = mul4; pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc2 = b->a + bi[i+1]; pj = b->j + bi[i+1]; nz = bi[i+2] - bi[i+1]; for (j=0; ja + bdiag[i+2]+1; pj = b->j + bdiag[i+2]+1; nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i+1]; *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ /* Now take care of 2nd column of diagonal 4x4 block. */ pc3 = rtmp3 + i+1; pc4 = rtmp4 + i+1; if (*pc3 != 0.0 || *pc4 != 0.0) { mul3 = (*pc3)*(*pc2); *pc3 = mul3; mul4 = (*pc4)*(*pc2); *pc4 = mul4; pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc3 = b->a + bi[i+2]; pj = b->j + bi[i+2]; nz = bi[i+3] - bi[i+2]; for (j=0; ja + bdiag[i+3]+1; pj = b->j + bdiag[i+3]+1; nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i+2]; *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ /* Now take care of 3rd column of diagonal 4x4 block. */ pc4 = rtmp4 + i+2; if (*pc4 != 0.0) { mul4 = (*pc4)*(*pc3); *pc4 = mul4; pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ for (j=0; ja */ rs = 0.0; /* L part */ pc4 = b->a + bi[i+3]; pj = b->j + bi[i+3]; nz = bi[i+4] - bi[i+3]; for (j=0; ja + bdiag[i+4]+1; pj = b->j + bdiag[i+4]+1; nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ for (j=0; ja + bdiag[i+3]; *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ break; default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); } if (sctx.newshift) break; /* break for (inod=0,i=0; inodshifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshiftinode.size) { C->ops->solve = MatSolve_SeqAIJ_Inode; } else { C->ops->solve = MatSolve_SeqAIJ; } C->ops->solveadd = MatSolveAdd_SeqAIJ; C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; C->ops->matsolve = MatMatSolve_SeqAIJ; C->assembled = PETSC_TRUE; C->preallocated = PETSC_TRUE; ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); /* MatShiftView(A,info,&sctx) */ if (sctx.nshift) { if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); } } PetscFunctionReturn(0); } PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) { Mat C = B; Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; IS iscol = b->col,isrow = b->row,isicol = b->icol; PetscErrorCode ierr; const PetscInt *r,*ic,*c,*ics; PetscInt n = A->rmap->n,*bi = b->i; PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; PetscInt *ai = a->i,*aj = a->j; PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; PetscScalar mul1,mul2,mul3,tmp; MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; PetscReal rs=0.0; FactorShiftCtx sctx; PetscFunctionBegin; sctx.shift_top = 0; sctx.nshift_max = 0; sctx.shift_lo = 0; sctx.shift_hi = 0; sctx.shift_fraction = 0; /* if both shift schemes are chosen by user, only use info->shiftpd */ if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ sctx.shift_top = 0; for (i=0; isctx.shift_top) sctx.shift_top = rs; } if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; sctx.shift_top *= 1.1; sctx.nshift_max = 5; sctx.shift_lo = 0.; sctx.shift_hi = 1.; } sctx.shift_amount = 0; sctx.nshift = 0; ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); ics = ic; node_max = a->inode.node_count; ns = a->inode.size; if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); /* If max inode size > 3, split it into two inodes.*/ /* also map the inode sizes according to the ordering */ ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); for (i=0,j=0; i3) { tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ ++j; tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; } else { tmp_vec1[j] = ns[i]; } } /* Use the correct node_max */ node_max = j; /* Now reorder the inode info based on mat re-ordering info */ /* First create a row -> inode_size_array_index map */ ierr = PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);CHKERRQ(ierr); for (i=0,row=0; i ba */ if (idx != row) rs += PetscAbsScalar(pc1[j]); } sctx.rs = rs; ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); if (sctx.newshift) goto endofwhile; break; case 2: for (j=0; jops->solve = MatSolve_SeqAIJ_inplace; /* do not set solve add, since MatSolve_Inode + Add is faster */ C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; C->assembled = PETSC_TRUE; C->preallocated = PETSC_TRUE; if (sctx.nshift) { if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); } } ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; IS iscol = a->col,isrow = a->row; PetscErrorCode ierr; const PetscInt *r,*c,*rout,*cout; PetscInt i,j,n = A->rmap->n; PetscInt node_max,row,nsz,aii,i0,i1,nz; const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; PetscScalar *x,*tmp,*tmps,tmp0,tmp1; PetscScalar sum1,sum2,sum3,sum4,sum5; const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; const PetscScalar *b; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); node_max = a->inode.node_count; ns = a->inode.size; /* Node Size array */ ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr); tmp = a->solve_work; ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; /* forward solve the lower triangular */ tmps = tmp; aa = a_a; aj = a_j; ad = a->diag; for (i = 0,row = 0; i< node_max; ++i) { nsz = ns[i]; aii = ai[row]; v1 = aa + aii; vi = aj + aii; nz = ai[row+1]- ai[row]; if (i < node_max-1) { /* Prefetch the indices for the next block */ PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ /* Prefetch the data for the next block */ PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); } switch (nsz) { /* Each loop in 'case' is unrolled */ case 1: sum1 = b[r[row]]; for (j=0; j=0; i--) { nsz = ns[i]; aii = ad[row+1] + 1; v1 = aa + aii; vi = aj + aii; nz = ad[row]- ad[row+1] - 1; if (i > 0) { /* Prefetch the indices for the next block */ PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the data for the next block */ PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); } switch (nsz) { /* Each loop in 'case' is unrolled */ case 1: sum1 = tmp[row]; for (j=0; jnz - A->cmap->n);CHKERRQ(ierr); PetscFunctionReturn(0); } /* Makes a longer coloring[] array and calls the usual code with that */ PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; PetscErrorCode ierr; PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; PetscInt *colorused,i; ISColoringValue *newcolor; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); /* loop over inodes, marking a color for each column*/ row = 0; for (i=0; i PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; MatScalar *ibdiag,*bdiag,work[25],*t; PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; const PetscScalar *xb, *b; PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; PetscErrorCode ierr; PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; PetscInt sz,k,ipvt[5]; PetscBool allowzeropivot,zeropivotdetected; const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; PetscFunctionBegin; allowzeropivot = PetscNot(A->erroriffailure); if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); if (!a->inode.ibdiagvalid) { if (!a->inode.ibdiag) { /* calculate space needed for diagonal blocks */ for (i=0; iinode.bdiagsize = cnt; ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); } /* copy over the diagonal blocks and invert them */ ibdiag = a->inode.ibdiag; bdiag = a->inode.bdiag; cnt = 0; for (i=0, row = 0; ifactorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); A->factorerror_zeropivot_row = row; ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); } ibdiag[cnt] = 1.0/ibdiag[cnt]; break; case 2: ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; break; case 3: ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; break; case 4: ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; break; case 5: ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; break; default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); } cnt += sizes[i]*sizes[i]; row += sizes[i]; } a->inode.ibdiagvalid = PETSC_TRUE; } ibdiag = a->inode.ibdiag; bdiag = a->inode.bdiag; t = a->inode.ssor_work; ierr = VecGetArray(xx,&x);CHKERRQ(ierr); ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ if (flag & SOR_ZERO_INITIAL_GUESS) { if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { for (i=0, row=0; ia + ii[row]; idx = a->j + ii[row]; /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ switch (sizes[i]) { case 1: sum1 = b[row]; for (n = 0; na + ii[row+1]; sum1 = b[row]; sum2 = b[row+1]; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; sum1 = b[row]; sum2 = b[row+1]; sum3 = b[row+2]; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; v4 = a->a + ii[row+3]; sum1 = b[row]; sum2 = b[row+1]; sum3 = b[row+2]; sum4 = b[row+3]; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; v4 = a->a + ii[row+3]; v5 = a->a + ii[row+4]; sum1 = b[row]; sum2 = b[row+1]; sum3 = b[row+2]; sum4 = b[row+3]; sum5 = b[row+4]; for (n = 0; nnz);CHKERRQ(ierr); } else xb = b; if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { ibdiag = a->inode.ibdiag+a->inode.bdiagsize; for (i=m-1, row=A->rmap->n-1; i>=0; i--) { ibdiag -= sizes[i]*sizes[i]; sz = ii[row+1] - diag[row] - 1; v1 = a->a + diag[row] + 1; idx = a->j + diag[row] + 1; /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ switch (sizes[i]) { case 1: sum1 = xb[row]; for (n = 0; na + diag[row-1] + 2; for (n = 0; na + diag[row-1] + 2; v3 = a->a + diag[row-2] + 3; for (n = 0; na + diag[row-1] + 2; v3 = a->a + diag[row-2] + 3; v4 = a->a + diag[row-3] + 4; for (n = 0; na + diag[row-1] + 2; v3 = a->a + diag[row-2] + 3; v4 = a->a + diag[row-3] + 4; v5 = a->a + diag[row-4] + 5; for (n = 0; nnz);CHKERRQ(ierr); } its--; } while (its--) { if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { for (i=0, row=0, ibdiag = a->inode.ibdiag; ia + ii[row]; idx = a->j + ii[row]; /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ switch (sizes[i]) { case 1: sum1 = b[row]; for (n = 0; nj + diag[row] + 1; v1 += 1; for (n = 0; na + ii[row+1]; sum1 = b[row]; sum2 = b[row+1]; for (n = 0; nj + diag[row] + 2; v1 += 2; v2 += 2; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; sum1 = b[row]; sum2 = b[row+1]; sum3 = b[row+2]; for (n = 0; nj + diag[row] + 3; v1 += 3; v2 += 3; v3 += 3; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; v4 = a->a + ii[row+3]; sum1 = b[row]; sum2 = b[row+1]; sum3 = b[row+2]; sum4 = b[row+3]; for (n = 0; nj + diag[row] + 4; v1 += 4; v2 += 4; v3 += 4; v4 += 4; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; v4 = a->a + ii[row+3]; v5 = a->a + ii[row+4]; sum1 = b[row]; sum2 = b[row+1]; sum3 = b[row+2]; sum4 = b[row+3]; sum5 = b[row+4]; for (n = 0; nj + diag[row] + 5; v1 += 5; v2 += 5; v3 += 5; v4 += 5; v5 += 5; for (n = 0; nnz);CHKERRQ(ierr); /* undercounts diag inverse */ } else xb = b; if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { ibdiag = a->inode.ibdiag+a->inode.bdiagsize; for (i=m-1, row=A->rmap->n-1; i>=0; i--) { ibdiag -= sizes[i]*sizes[i]; /* set RHS */ if (xb == b) { /* whole (old way) */ sz = ii[row+1] - ii[row]; idx = a->j + ii[row]; switch (sizes[i]) { case 5: v5 = a->a + ii[row-4]; case 4: /* fall through */ v4 = a->a + ii[row-3]; case 3: v3 = a->a + ii[row-2]; case 2: v2 = a->a + ii[row-1]; case 1: v1 = a->a + ii[row]; break; default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); } } else { /* upper, no diag */ sz = ii[row+1] - diag[row] - 1; idx = a->j + diag[row] + 1; switch (sizes[i]) { case 5: v5 = a->a + diag[row-4] + 5; case 4: /* fall through */ v4 = a->a + diag[row-3] + 4; case 3: v3 = a->a + diag[row-2] + 3; case 2: v2 = a->a + diag[row-1] + 2; case 1: v1 = a->a + diag[row] + 1; } } /* set sum */ switch (sizes[i]) { case 5: sum5 = xb[row-4]; case 4: /* fall through */ sum4 = xb[row-3]; case 3: sum3 = xb[row-2]; case 2: sum2 = xb[row-1]; case 1: /* note that sum1 is associated with the last row */ sum1 = xb[row]; } /* do sums */ for (n = 0; nnz);CHKERRQ(ierr); } else { ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ } } } if (flag & SOR_EISENSTAT) { /* Apply (U + D)^-1 where D is now the block diagonal */ ibdiag = a->inode.ibdiag+a->inode.bdiagsize; for (i=m-1, row=A->rmap->n-1; i>=0; i--) { ibdiag -= sizes[i]*sizes[i]; sz = ii[row+1] - diag[row] - 1; v1 = a->a + diag[row] + 1; idx = a->j + diag[row] + 1; /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ switch (sizes[i]) { case 1: sum1 = b[row]; for (n = 0; na + diag[row-1] + 2; for (n = 0; na + diag[row-1] + 2; v3 = a->a + diag[row-2] + 3; for (n = 0; na + diag[row-1] + 2; v3 = a->a + diag[row-2] + 3; v4 = a->a + diag[row-3] + 4; for (n = 0; na + diag[row-1] + 2; v3 = a->a + diag[row-2] + 3; v4 = a->a + diag[row-3] + 4; v5 = a->a + diag[row-4] + 5; for (n = 0; nnz);CHKERRQ(ierr); /* t = b - D x where D is the block diagonal */ cnt = 0; for (i=0, row=0; ia + ii[row]; idx = a->j + ii[row]; /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ switch (sizes[i]) { case 1: sum1 = t[row]; for (n = 0; na + ii[row+1]; sum1 = t[row]; sum2 = t[row+1]; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; sum1 = t[row]; sum2 = t[row+1]; sum3 = t[row+2]; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; v4 = a->a + ii[row+3]; sum1 = t[row]; sum2 = t[row+1]; sum3 = t[row+2]; sum4 = t[row+3]; for (n = 0; na + ii[row+1]; v3 = a->a + ii[row+2]; v4 = a->a + ii[row+3]; v5 = a->a + ii[row+4]; sum1 = t[row]; sum2 = t[row+1]; sum3 = t[row+2]; sum4 = t[row+3]; sum5 = t[row+4]; for (n = 0; nnz);CHKERRQ(ierr); } ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); PetscFunctionReturn(0); } PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; const MatScalar *bdiag = a->inode.bdiag; const PetscScalar *b; PetscErrorCode ierr; PetscInt m = a->inode.node_count,cnt = 0,i,row; const PetscInt *sizes = a->inode.size; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); ierr = VecGetArray(xx,&x);CHKERRQ(ierr); ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); cnt = 0; for (i=0, row=0; idata; PetscFunctionBegin; a->inode.node_count = 0; a->inode.use = PETSC_FALSE; a->inode.checked = PETSC_FALSE; a->inode.mat_nonzerostate = -1; A->ops->getrowij = MatGetRowIJ_SeqAIJ; A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; A->ops->coloringpatch = NULL; A->ops->multdiagonalblock = NULL; if (A->factortype) { A->ops->solve = MatSolve_SeqAIJ_inplace; } PetscFunctionReturn(0); } /* samestructure indicates that the matrix has not changed its nonzero structure so we do not need to recompute the inodes */ PetscErrorCode MatSeqAIJCheckInode(Mat A) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; PetscBool flag; const PetscInt *idx,*idy,*ii; PetscFunctionBegin; if (!a->inode.use) { ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr); ierr = PetscFree(a->inode.size);CHKERRQ(ierr); PetscFunctionReturn(0); } if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); m = A->rmap->n; if (!a->inode.size) { ierr = PetscMalloc1(m+1,&a->inode.size);CHKERRQ(ierr); } ns = a->inode.size; i = 0; node_count = 0; idx = a->j; ii = a->i; while (i < m) { /* For each row */ nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ /* Limits the number of elements in a node to 'a->inode.limit' */ for (j=i+1,idy=idx,blk_size=1; jinode.limit; ++j,++blk_size) { nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ if (nzy != nzx) break; idy += nzx; /* Same nonzero pattern */ ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr); if (!flag) break; } ns[node_count++] = blk_size; idx += blk_size*nzx; i = j; } /* If not enough inodes found,, do not use inode version of the routines */ if (!m || node_count > .8*m) { ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr); ierr = PetscFree(a->inode.size);CHKERRQ(ierr); ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); } else { if (!A->factortype) { A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; if (A->rmap->n == A->cmap->n) { A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; } } else { A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; } a->inode.node_count = node_count; ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); } a->inode.checked = PETSC_TRUE; a->inode.mat_nonzerostate = A->nonzerostate; PetscFunctionReturn(0); } PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) { Mat B =*C; Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt m=A->rmap->n; PetscFunctionBegin; c->inode.use = a->inode.use; c->inode.limit = a->inode.limit; c->inode.max_limit = a->inode.max_limit; c->inode.checked = PETSC_FALSE; c->inode.size = NULL; c->inode.node_count = 0; c->inode.ibdiagvalid = PETSC_FALSE; c->inode.ibdiag = NULL; c->inode.bdiag = NULL; c->inode.mat_nonzerostate = -1; if (a->inode.use) { if (a->inode.checked && a->inode.size) { ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); ierr = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr); c->inode.checked = PETSC_TRUE; c->inode.node_count = a->inode.node_count; c->inode.mat_nonzerostate = (*C)->nonzerostate; } /* note the table of functions below should match that in MatSeqAIJCheckInode() */ if (!B->factortype) { B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; } else { B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; } } PetscFunctionReturn(0); } PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) { PetscInt k; const PetscInt *vi; PetscFunctionBegin; vi = aj + ai[row]; for (k=0; kdata; PetscErrorCode ierr; PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; PetscInt *cols1,*cols2,*ns; const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; PetscBool flag; PetscFunctionBegin; if (!a->inode.use) PetscFunctionReturn(0); if (a->inode.checked) PetscFunctionReturn(0); m = A->rmap->n; if (a->inode.size) ns = a->inode.size; else { ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); } i = 0; node_count = 0; ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); while (i < m) { /* For each row */ nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ nzx = nzl1 + nzu1 + 1; MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); /* Limits the number of elements in a node to 'a->inode.limit' */ for (j=i+1,blk_size=1; jinode.limit; ++j,++blk_size) { nzl2 = ai[j+1] - ai[j]; nzu2 = adiag[j] - adiag[j+1] - 1; nzy = nzl2 + nzu2 + 1; if (nzy != nzx) break; ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr); if (!flag) break; } ns[node_count++] = blk_size; i = j; } ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); /* If not enough inodes found,, do not use inode version of the routines */ if (!m || node_count > .8*m) { ierr = PetscFree(ns);CHKERRQ(ierr); a->inode.node_count = 0; a->inode.size = NULL; a->inode.use = PETSC_FALSE; ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); } else { A->ops->mult = NULL; A->ops->sor = NULL; A->ops->multadd = NULL; A->ops->getrowij = NULL; A->ops->restorerowij = NULL; A->ops->getcolumnij = NULL; A->ops->restorecolumnij = NULL; A->ops->coloringpatch = NULL; A->ops->multdiagonalblock = NULL; a->inode.node_count = node_count; a->inode.size = ns; ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); } a->inode.checked = PETSC_TRUE; PetscFunctionReturn(0); } PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) { Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; PetscFunctionBegin; a->inode.ibdiagvalid = PETSC_FALSE; PetscFunctionReturn(0); } /* This is really ugly. if inodes are used this replaces the permutations with ones that correspond to rows/cols of the matrix rather then inode blocks */ PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) { PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); PetscFunctionReturn(0); } PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) { Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; const PetscInt *ridx,*cidx; PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; PetscInt nslim_col,*ns_col; IS ris = *rperm,cis = *cperm; PetscFunctionBegin; if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); /* Form the inode structure for the rows of permuted matric using inv perm*/ for (i=0,tns[0]=0; iassembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); if (f) { ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); } PetscFunctionReturn(0); } PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscFunctionBegin; if (node_count) *node_count = a->inode.node_count; if (sizes) *sizes = a->inode.size; if (limit) *limit = a->inode.limit; PetscFunctionReturn(0); }