#define PETSCMAT_DLL

/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
EXTERN PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
EXTERN PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants another processor to process some elements corresponding
  to index sets is[1] and is[5] (imax=2), the message looks like:

   mesg [0] = 2                  (number of index sets in the message, i.e. count of imax)
   -----------
   mesg [1] = 1                  => is[1]
   mesg [2] = sizeof(is[1]);
   -----------
   mesg [3] = 5                  => is[5]
   mesg [4] = sizeof(is[5]);
   -----------
   mesg [5]
    ...
   mesg [n]    data of is[1]
   -----------
   mesg [n+1]
    ...
   mesg [m]    data of is[5]
   -----------

  Notes:
   nrqs - no of requests sent (or to be sent out)
   nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  PetscInt       **idx,*n,*rtable,**data,len,*idx_i;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  m    = C->rmap.N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  len  = (imax+1)*sizeof(PetscInt*) + (imax + m)*sizeof(PetscInt);
  ierr = PetscMalloc(len,&idx);CHKERRQ(ierr);
  n      = (PetscInt*)(idx + imax);
  rtable = n + imax;

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = C->rmap.range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap.rstart;
  cstart = C->cmap.rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  m      = C->rmap.N;
  rstart = C->rmap.rstart;
  cstart = C->cmap.rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {

  if (C->rmap.n) max1 = ct*(a->nz + b->nz)/C->rmap.n;
  else           max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
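  /* xdata[0] is sized from the estimate above (three times the larger of the total
     requested size and a nonzero-density based guess, plus one); judging from the
     new_estimate/no_malloc variables below, the buffer is presumably re-allocated and
     copied whenever a processed row does not fit in the current allocation. */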
  ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) {

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap.range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  PetscScalar    *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap.N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap.rstart; i<A->rmap.rend; i++) {
      lens[i] = ad->i[i-A->rmap.rstart+1] - ad->i[i-A->rmap.rstart] + bd->i[i-A->rmap.rstart+1] - bd->i[i-A->rmap.rstart];
    }
    sendcount = A->rmap.rend - A->rmap.rstart;
    ierr      = PetscMalloc(2*size*sizeof(PetscMPIInt),&recvcounts);CHKERRQ(ierr);
    displs    = recvcounts + size;
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap.range[i+1] - A->rmap.range[i];
      displs[i]     = A->rmap.range[i];
    }
    ierr = MPI_Allgatherv(lens+A->rmap.rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);

    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap.N,A->cmap.N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
        Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap.rend - A->rmap.rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap.rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }
      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap.rstart + *a_jsendbuf++;
      }
      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
        Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap.range[i]; j<A->rmap.range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);

    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap.N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap.N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
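    /* Only the nonzero structure of B is put in place by the assembly below; the
       numerical values are copied in afterwards (for both MAT_INITIAL_MATRIX and
       MAT_REUSE_MATRIX) and gathered onto every process with MPI_Allgatherv. */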
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {  /* MAT_REUSE_MATRIX */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  sendcount = ad->nz + bd->nz;
  sendbuf   = b->a + b->i[rstarts[rank]];
  a_sendbuf = ad->a;
  b_sendbuf = bd->a;
  b_sendj   = bd->j;
  n         = A->rmap.rend - A->rmap.rstart;
  cnt       = 0;
  for (i=0; i<n; i++) {
    /* put in lower diagonal portion */
    m = bd->i[i+1] - bd->i[i];
    while (m > 0) {
      /* is it above diagonal (in bd (compressed) numbering) */
      if (garray[*b_sendj] > A->rmap.rstart + i) break;
      sendbuf[cnt++] = *b_sendbuf++;
      m--;
      b_sendj++;
    }
    /* put in diagonal portion */
    for (j=ad->i[i]; j<ad->i[i+1]; j++) {
      sendbuf[cnt++] = *a_sendbuf++;
    }
    /* put in upper diagonal portion */
    while (m-- > 0) {
      sendbuf[cnt++] = *b_sendbuf++;
      b_sendj++;
    }
  }
  if (cnt != sendcount) SETERRQ2(PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

  /* -----------------------------------------------------------------
       Gather all numerical values to all processors
  */
  if (!recvcounts) {
    ierr   = PetscMalloc(2*size*sizeof(PetscInt),&recvcounts);CHKERRQ(ierr);
    displs = recvcounts + size;
  }
  for (i=0; i<size; i++) {
    recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
  }
  displs[0] = 0;
  for (i=1; i<size; i++) {
    displs[i] = displs[i-1] + recvcounts[i-1];
  }
  recvbuf = b->a;
  ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
  ierr = PetscFree(recvcounts);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscTruth     rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;

  PetscFunctionBegin;
  /* Check for the special case where each processor gets the entire matrix */
  if (ismax == 1 && C->rmap.N == C->cmap.N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap.N && ncol == C->cmap.N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetTruth(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap.N * sizeof(PetscInt));
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }
  PetscFunctionReturn(0);
}
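/*
   MatGetSubMatrices_MPIAIJ_Local extracts the requested submatrices for one stage.
   Roughly (as suggested by the tags and buffers used below), the communication
   proceeds in four rounds of nonblocking messages:
     tag0 - each process sends the lists of rows it needs       (sbuf1/rbuf1),
     tag1 - the owners reply with the sizes of the requested rows (sbuf2/rbuf2),
     tag2 - the owners send the column indices of those rows      (sbuf_aj/rbuf3),
     tag3 - the owners send the numerical values                  (sbuf_aa/rbuf4).
   Rows that are owned locally are handled directly with MatGetRow_MPIAIJ.
*/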
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  PetscInt       **irow,**icol,*nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  PetscInt       len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscTruth     sorted;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Check if the col indices are sorted */
  for (i=0; i<ismax; i++) {

      while (row >= C->rmap.range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<jmax; j++) {
      row = irow_i[j];
      l   = 0;
      while (row >= C->rmap.range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) {

  {
    Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap.rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; i++) {

  /* Allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];

  {
    PetscInt *a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap.rstart,rstart = C->rmap.rstart,*bmap = c->garray;
    PetscInt cend = C->cmap.rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {

          if (ctmp >= cend) cols[lwrite++] = ctmp;
        }
        ct2 += ncols;
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];

  {
    PetscInt    *a_i = a->i,*b_i = b->i,*cworkB,lwrite;
    PetscInt    cstart = C->cmap.rstart,rstart = C->rmap.rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap.rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {

          if (bmap[cworkB[l]] >= cend) vals[lwrite++] = vworkB[l];
        }
        ct2 += ncols;
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map */
  {
    PetscInt *icol_i;

    len     = (1+ismax)*sizeof(PetscInt*) + (1+ismax*C->cmap.N)*sizeof(PetscInt);
    ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
    cmap[0] = (PetscInt*)(cmap + ismax);
    ierr    = PetscMemzero(cmap[0],(1+ismax*C->cmap.N)*sizeof(PetscInt));CHKERRQ(ierr);
    for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap.N; }
    for (i=0; i<ismax; i++) {

      while (row >= C->rmap.range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        for (k=0; k<ncols; k++) {

  /* create row map */
  len     = (1+ismax)*sizeof(PetscInt*) + ismax*C->rmap.N*sizeof(PetscInt);
  ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
  rmap[0] = (PetscInt*)(rmap + ismax);
  ierr    = PetscMemzero(rmap[0],ismax*C->rmap.N*sizeof(PetscInt));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap.N; }
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
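  /* Create the submatrices.
     For MAT_REUSE_MATRIX an existing submatrix is accepted only if it has the
     expected local dimensions and the same number of nonzeros per row (checked
     against lens[] below); otherwise an error is raised. */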
  if (scall == MAT_REUSE_MATRIX) {
    PetscTruth flag;

    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap.n != nrow[i]) || (submats[i]->cmap.n != ncol[i])) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      }
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap.n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap.n*sizeof(PetscInt));CHKERRQ(ierr);
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
        l   = 0;
        while (row >= C->rmap.range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr  = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          for (k=0; k<ncols; k++) {

  /* Now assemble the off-processor rows */

      mat       = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++) {