/*$Id: mpiov.c,v 1.26.1.76.2.22 2001/09/07 20:09:38 bsmith Exp $*/
/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include "src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
EXTERN int MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
EXTERN int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
int MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS is[],int ov)
{
  int i,ierr;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5], the message looks like:

  mesg [0] = 2      (number of index sets in the message)
  -----------
  mesg [1] = 1   => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5   => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
   ...
  mesg [n]          data(is[1])
  -----------
  mesg [n+1]
   ...
  mesg [m]          data(is[5])
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS is[])
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
  int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
  int         size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
  int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
  PetscBT     *table;
  MPI_Comm    comm;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status  *s_status,*recv_status;

  PetscFunctionBegin;
  comm = C->comm;
  size = c->size;
  rank = c->rank;
  m    = C->M;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  len  = (imax+1)*sizeof(int*) + (imax + m)*sizeof(int);
  ierr = PetscMalloc(len,&idx);CHKERRQ(ierr);
  n      = (int*)(idx + imax);
  rtable = n + imax;

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rowners[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }
  /* ... */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   Performs the local part of the overlap increase: for every row already in
   an index set, the columns of that row (from both the diagonal A block and
   the off-diagonal B block) are added to the set, using a bit table to skip
   duplicates.
*/
static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,char **table,int *isz,int **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  int        start,end,val,max,rstart,cstart,*ai,*aj;
  int        *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* diagonal (A) block */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* off-diagonal (B) block */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  int        rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  int        row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  int        val,max1,max2,rank,m,no_malloc = 0,*tmp,new_estimate,ctr;
  int        *rbuf_i,kmax,rbuf_0,ierr;
  PetscBT    xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  m      = C->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
  else      max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
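  /*
     Scratch-space estimate: take the larger of the total size of the incoming
     index data (total_sz) and the number of requested rows scaled by the
     average number of nonzeros per local row (ct*(a->nz + b->nz)/C->m), then
     triple it and pad by one.  Worked example (illustrative numbers only):
     with ct = 100 requested rows, a->nz + b->nz = 5000 local nonzeros and
     C->m = 1000 local rows, max1 = 100*5000/1000 = 500; if total_sz = 300,
     then mem_estimate = 3*(500+1) = 1503 ints.  If the estimate is ever
     exceeded the buffer is reallocated, and each such event is counted in
     no_malloc.
  */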
  ierr = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from a requesting processor */
    /* ... */
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
int MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ  *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ  *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  int         ierr,sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
  int         m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  PetscScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(A->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(A->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->M*sizeof(int),&lens);CHKERRQ(ierr);
    for (i=a->rstart; i<a->rend; i++) {
      lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
    }
    sendcount = a->rend - a->rstart;
    ierr      = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
    displs    = recvcounts + size;
    for (i=0; i<size; i++) {
      recvcounts[i] = a->rowners[i+1] - a->rowners[i];
      displs[i]     = a->rowners[i];
    }
    ierr = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);

    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,A->M,A->N,PETSC_DETERMINE,PETSC_DETERMINE,&B);CHKERRQ(ierr);
    ierr  = MatSetType(B,a->A->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
        Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = a->rend - a->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > a->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);

    /*--------------------------------------------------------------------
        Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);

    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->M*sizeof(int));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->M; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }
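  /*
     Both gathers above use the standard MPI_Allgatherv counts/displacements
     pattern: every process contributes the rows it owns and receives the
     entire array.  A minimal sketch of the same pattern, assuming a block
     row distribution described by rowners[] as in Mat_MPIAIJ (illustrative
     only; the code above recomputes the counts from the per-row lengths for
     the column-index gather):

         for (i=0; i<size; i++) {
           recvcounts[i] = rowners[i+1] - rowners[i];
           displs[i]     = rowners[i];
         }
         MPI_Allgatherv(local,recvcounts[rank],MPI_INT,
                        global,recvcounts,displs,MPI_INT,comm);

     The displacements are measured in elements of the receive type, so the
     same arrays cannot be reused between the row-length gather and the
     index/value gathers without being recomputed.
  */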
  /*--------------------------------------------------------------------
      Copy my part of matrix numerical values into the values location
  */
  sendcount = ad->nz + bd->nz;
  sendbuf   = b->a + b->i[rstarts[rank]];
  a_sendbuf = ad->a;
  b_sendbuf = bd->a;
  b_sendj   = bd->j;
  n         = a->rend - a->rstart;
  cnt       = 0;
  for (i=0; i<n; i++) {
    /* put in lower diagonal portion */
    m = bd->i[i+1] - bd->i[i];
    while (m > 0) {
      /* is it above diagonal (in bd (compressed) numbering) */
      if (garray[*b_sendj] > a->rstart + i) break;
      sendbuf[cnt++] = *b_sendbuf++;
      m--;
      b_sendj++;
    }

    /* put in diagonal portion */
    for (j=ad->i[i]; j<ad->i[i+1]; j++) {
      sendbuf[cnt++] = *a_sendbuf++;
    }

    /* put in upper diagonal portion */
    while (m-- > 0) {
      sendbuf[cnt++] = *b_sendbuf++;
      b_sendj++;
    }
  }
  if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);

  /* -----------------------------------------------------------------
     Gather all numerical values to all processors
  */
  if (!recvcounts) {
    ierr   = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
    displs = recvcounts + size;
  }
  for (i=0; i<size; i++) {
    recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
  }
  displs[0] = 0;
  for (i=1; i<size; i++) {
    displs[i] = displs[i-1] + recvcounts[i-1];
  }
  recvbuf = b->a;
  ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);CHKERRQ(ierr);
  ierr = PetscFree(recvcounts);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  int        nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
  PetscTruth rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;

  PetscFunctionBegin;
  /* Check for the special case where each processor gets the entire matrix */
  if (ismax == 1 && C->M == C->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->M && ncol == C->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->N * sizeof(int));
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }
  PetscFunctionReturn(0);
}
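/*
   Example usage (a minimal sketch; error checking omitted, and the exact
   calling sequences vary across PETSc versions).  Extract rows and columns
   0..9 of a parallel matrix C as one sequential submatrix per processor:

     IS  isrow,iscol;
     Mat *submat;

     ISCreateStride(PETSC_COMM_SELF,10,0,1,&isrow);
     ISCreateStride(PETSC_COMM_SELF,10,0,1,&iscol);
     MatGetSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);
     (use submat[0], a sequential 10 by 10 matrix, here)
     MatDestroyMatrices(1,&submat);
     ISDestroy(isrow);
     ISDestroy(iscol);

   MatGetSubMatrices() is collective on the matrix, which is why the stage
   loop above uses MPI_Allreduce() with MPI_MAX: every processor must make
   the same number of calls to MatGetSubMatrices_MPIAIJ_Local(), even in a
   stage where it has no index sets left of its own.
*/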
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
int MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
  Mat         A = c->A;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
  int         **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
  int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
  int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  int         *rmap_i,tag0,tag1,tag2,tag3;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status  *r_status3,*r_status4,*s_status4;
  MPI_Comm    comm;
  PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscTruth  sorted,flag;
  int         *onodes1,*olengths1;

  PetscFunctionBegin;
  comm = C->comm;
  tag0 = C->tag;
  size = c->size;
  rank = c->rank;
  m    = C->M;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  /* Check if the col indices are sorted */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IS is not sorted");
  }
  /* ... */

  /* Create hash table for the mapping: row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = c->rowners[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }
  /* ... */

  {
    Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    int        *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
    int        *sbuf2_i;

    for (i=0; i<nrqr; i++) {
      /* ... */
    }
  }
  /* ... */

  /* Allocate buffers for a->j, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
  {
    int *a_i = a->i,*b_i = b->i,imark;
    int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
    int *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      /* ... */
    }
  }
  /* ... */

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
  {
    int         *a_i = a->i,*b_i = b->i,*cworkB,imark;
    int         cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
    int         *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      /* ... */
    }
  }
  /* ... */

  /* Create col map: for each submatrix, global column -> local column + 1
     (zero means the column is not in the submatrix) */
  len     = (1+ismax)*sizeof(int*) + ismax*C->N*sizeof(int);
  ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
  cmap[0] = (int *)(cmap + ismax);
  ierr    = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
  for (i=0; i<ismax; i++) {
    /* ... */
  }

  /* Create row map: for each submatrix, global row -> local row */
  len     = (1+ismax)*sizeof(int*) + ismax*C->M*sizeof(int);
  ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
  rmap[0] = (int *)(rmap + ismax);
  ierr    = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));CHKERRQ(ierr);
  for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M; }
  for (i=0; i<ismax; i++) {
    /* ... */
  }
  /* ... */

  if (scall == MAT_REUSE_MATRIX) {
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ *)(submats[i]->data);
      if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      }
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);CHKERRQ(ierr);
      if (flag == PETSC_FALSE) {
        SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initialize matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));CHKERRQ(ierr);
      submats[i]->factor = C->factor;
    }
  } else {
    for (i=0; i<ismax; i++) {
      ierr = MatCreate(PETSC_COMM_SELF,nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE,submats+i);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int         ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for (j=0; j<jmax; j++) {
        /* ... */
      }
    }
  }

  /* Now assemble the off-processor rows */
  /* ... */
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++) {
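/*
   Both assembly passes rely on the rmap/cmap tables built above: rmap[i]
   takes a global row number of C to the corresponding row of submats[i],
   and cmap[i] takes a global column number to the corresponding local
   column plus one, so that zero means "this column is not in the
   submatrix".  A minimal sketch of the per-entry lookup used while copying
   one row (names follow the declarations above; the surrounding loop
   bookkeeping is omitted and the exact indexing is as in the original
   file):

       tcol = cmap_i[cols[k]];            (zero => drop this entry)
       if (tcol) {
         mat_j[ilen_row]   = tcol - 1;    (store the local column index)
         mat_a[ilen_row++] = vals[k];
       }
*/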