
/*
   Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Increase the overlap of the index sets is[]
   by ov levels, by applying the one-level expansion ov times.

   Input:
     C    - the MPIAIJ matrix
     imax - number of index sets
     is   - the index sets (modified in place)
     ov   - number of overlap levels requested (must be >= 0)
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but each one-level expansion uses the memory-scalable (PetscSF-based) path.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One level of overlap expansion.

   Steps visible below:
     1. Collect the rows of each IS owned by other ranks (PetscLayoutFindOwner).
     2. If no rank has remote rows (Allreduce MAX == 0), only local expansion
        is needed; otherwise exchange <is id, row> pairs with the owning ranks
        via PetscCommBuildTwoSided + a PetscSF broadcast.
     3. The owners expand those rows (..._Send_Scalable), the results are
        shipped back over a second SF, local expansion overlaps the
        communication, and the received columns are merged (..._Receive_Scalable).

   The is[] array is replaced with the expanded index sets.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes,*sbdata,nto,nfrom;
  const PetscInt **indices,*indices_i;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  /* communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* row map */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data */
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /* get length and indices */
  for (i=0,tlength=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;       /* number of rows to owner */
        rrow_ranks[nrrows]   = owner; /* processor */
        rrow_isids[nrrows]   = i;     /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks; tosizes is packed as <size,offset> pairs per target */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size (two ints per row: <is id,row>) */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* build a star forest that gathers the remote-row requests onto their owners */
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* deal with remote data: expand the requested rows on their owners */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge the column indices
   received from remote owners into the local index sets.

   Input:
     mat      - the matrix (unused here except through the IS contents)
     nidx     - number of index sets
     is       - index sets; destroyed and re-created with merged, sorted,
                duplicate-free indices
     nrecvs   - total length of recvdata
     recvdata - stream of triples <is id, count, count columns>

   Each IS gets a strip of (max_lsize+nrecvs) ints in indices_temp: enough
   for its original entries plus, in the worst case, every received entry.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i] = lsize;
  }
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information: append the received columns to the right IS strip */
  for (i=0; i<nrecvs; ) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For each <is id, row> pair
   requested by a remote rank, gather the (global) column indices of that
   locally-owned row from both the diagonal (Amat) and off-diagonal (Bmat)
   blocks, deduplicate, and pack reply buffers.

   Output (ownership transfers to the caller, which frees both with PetscFree2):
     sbrowsizes - per-source <size,offset> pairs describing sbrows
     sbrows     - packed triples <is id, count, count columns> per source rank
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count (deduplicated) columns per <source,is> so the reply
     buffer can be allocated in one shot */
  for (i=0; i<nfrom; i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i])*/
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i]) */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: use this code again, now packing the actual reply data */
  for (i=0;i<nfrom;i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* offsets */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each index set by the
   column indices of its locally-owned rows (rows owned by other ranks are
   skipped; they are handled by the Send/Receive pair).  Each is[i] is
   destroyed and re-created sorted and duplicate-free.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static
/*
   MatIncreaseOverlap_MPIAIJ_Once - One level of overlap expansion, original
   (non-scalable) algorithm: rows belonging to other ranks are sent to their
   owners with MPI_Isend/Irecv, expanded there, and sent back.

   NOTE(review): the local work buffers are sized O(M*imax) where M is the
   GLOBAL row count (see the PetscCalloc5 of d_p and t_p below) - this is
   what the _Scalable variant above avoids.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into per-destination send buffers */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (now expanded) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* append column indices of each row already in the set; the bit table
       keeps entries unique and new entries are appended past max, so they
       are not re-expanded in this pass */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* Count the total number of requested rows (ct) and the total number of row
     indices carried in all messages (total_sz); rbuf_i[0] is the number of index
     sets in message i and rbuf_i[2*j] the length of its j-th set. */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Estimate the reply-buffer size from the average number of nonzeros per local row;
     the buffer is grown on demand below if the estimate proves too small. */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1; /* skip the header: count + (is_no,len) pairs */
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* First copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* shared buffer full: grow by 1.5x and re-anchor all per-message pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Then append the overlap: the global columns of every row copied above */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
     Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row length = nonzeros in diagonal block + nonzeros in off-diagonal block */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into B's CSR at my row block */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* Merge the off-diagonal (B) and diagonal (A) column indices of each local row
       into globally sorted order: B entries below the diagonal block, then A, then
       the remaining B entries. */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered; reuse the existing B */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* Same lower/diagonal/upper merge as for the column indices above, applied
       to the numerical values so they line up with b->j. */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: the counts were not built above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
  /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
     However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
     take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
  */
  for (i = 0; i < ismax; ++i) {
    PetscBool sorted;
    ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
    if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
  }

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* take the fast all-matrix path only if every process wants it (MPI_MIN over bools) */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
1193 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1194 */ 1195 if (!nmax) nmax = 1; 1196 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 1197 1198 /* Make sure every processor loops through the nstages */ 1199 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 1200 1201 for (i=0,pos=0; i<nstages; i++) { 1202 if (pos+nmax <= ismax) max_no = nmax; 1203 else if (pos == ismax) max_no = 0; 1204 else max_no = ismax-pos; 1205 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 1206 pos += max_no; 1207 } 1208 1209 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 1210 PetscFunctionReturn(0); 1211 } 1212 1213 /* -------------------------------------------------------------------------*/ 1214 #undef __FUNCT__ 1215 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 1216 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 1217 { 1218 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1219 Mat A = c->A; 1220 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 1221 const PetscInt **icol,**irow; 1222 PetscInt *nrow,*ncol,start; 1223 PetscErrorCode ierr; 1224 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 1225 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 1226 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 1227 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 1228 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 1229 #if defined(PETSC_USE_CTABLE) 1230 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 1231 #else 1232 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 1233 #endif 1234 const PetscInt *irow_i; 1235 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 1236 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1237 MPI_Request 
*r_waits4,*s_waits3,*s_waits4; 1238 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 1239 MPI_Status *r_status3,*r_status4,*s_status4; 1240 MPI_Comm comm; 1241 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 1242 PetscMPIInt *onodes1,*olengths1; 1243 PetscMPIInt idex,idex2,end; 1244 1245 PetscFunctionBegin; 1246 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1247 tag0 = ((PetscObject)C)->tag; 1248 size = c->size; 1249 rank = c->rank; 1250 1251 /* Get some new tags to keep the communication clean */ 1252 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 1253 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1254 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1255 1256 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 1257 1258 for (i=0; i<ismax; i++) { 1259 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 1260 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 1261 if (allcolumns[i]) { 1262 icol[i] = NULL; 1263 ncol[i] = C->cmap->N; 1264 } else { 1265 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 1266 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 1267 } 1268 } 1269 1270 /* evaluate communication - mesg to who, length of mesg, and buffer space 1271 required. 
Based on this, buffers are allocated, and data copied into them*/ 1272 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 1273 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1274 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1275 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1276 for (i=0; i<ismax; i++) { 1277 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1278 jmax = nrow[i]; 1279 irow_i = irow[i]; 1280 for (j=0; j<jmax; j++) { 1281 l = 0; 1282 row = irow_i[j]; 1283 while (row >= C->rmap->range[l+1]) l++; 1284 proc = l; 1285 w4[proc]++; 1286 } 1287 for (j=0; j<size; j++) { 1288 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 1289 } 1290 } 1291 1292 nrqs = 0; /* no of outgoing messages */ 1293 msz = 0; /* total mesg length (for all procs) */ 1294 w1[rank] = 0; /* no mesg sent to self */ 1295 w3[rank] = 0; 1296 for (i=0; i<size; i++) { 1297 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 1298 } 1299 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1300 for (i=0,j=0; i<size; i++) { 1301 if (w1[i]) { pa[j] = i; j++; } 1302 } 1303 1304 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1305 for (i=0; i<nrqs; i++) { 1306 j = pa[i]; 1307 w1[j] += w2[j] + 2* w3[j]; 1308 msz += w1[j]; 1309 } 1310 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1311 1312 /* Determine the number of messages to expect, their lengths, from from-ids */ 1313 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1314 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1315 1316 /* Now post the Irecvs corresponding to these messages */ 1317 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1318 
1319 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1320 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1321 1322 /* Allocate Memory for outgoing messages */ 1323 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1324 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1325 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1326 1327 { 1328 PetscInt *iptr = tmp,ict = 0; 1329 for (i=0; i<nrqs; i++) { 1330 j = pa[i]; 1331 iptr += ict; 1332 sbuf1[j] = iptr; 1333 ict = w1[j]; 1334 } 1335 } 1336 1337 /* Form the outgoing messages */ 1338 /* Initialize the header space */ 1339 for (i=0; i<nrqs; i++) { 1340 j = pa[i]; 1341 sbuf1[j][0] = 0; 1342 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 1343 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 1344 } 1345 1346 /* Parse the isrow and copy data into outbuf */ 1347 for (i=0; i<ismax; i++) { 1348 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1349 irow_i = irow[i]; 1350 jmax = nrow[i]; 1351 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 1352 l = 0; 1353 row = irow_i[j]; 1354 while (row >= C->rmap->range[l+1]) l++; 1355 proc = l; 1356 if (proc != rank) { /* copy to the outgoing buf*/ 1357 ctr[proc]++; 1358 *ptr[proc] = row; 1359 ptr[proc]++; 1360 } 1361 } 1362 /* Update the headers for the current IS */ 1363 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1364 if ((ctr_j = ctr[j])) { 1365 sbuf1_j = sbuf1[j]; 1366 k = ++sbuf1_j[0]; 1367 sbuf1_j[2*k] = ctr_j; 1368 sbuf1_j[2*k-1] = i; 1369 } 1370 } 1371 } 1372 1373 /* Now post the sends */ 1374 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1375 for (i=0; i<nrqs; ++i) { 1376 j = pa[i]; 1377 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 1378 } 1379 1380 /* Post Receives to capture the buffer size */ 1381 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 1382 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 1383 rbuf2[0] = tmp + msz; 1384 for (i=1; 
i<nrqs; ++i) { 1385 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 1386 } 1387 for (i=0; i<nrqs; ++i) { 1388 j = pa[i]; 1389 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 1390 } 1391 1392 /* Send to other procs the buf size they should allocate */ 1393 1394 1395 /* Receive messages*/ 1396 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 1397 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1398 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 1399 { 1400 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 1401 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 1402 PetscInt *sbuf2_i; 1403 1404 for (i=0; i<nrqr; ++i) { 1405 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1406 1407 req_size[idex] = 0; 1408 rbuf1_i = rbuf1[idex]; 1409 start = 2*rbuf1_i[0] + 1; 1410 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1411 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 1412 sbuf2_i = sbuf2[idex]; 1413 for (j=start; j<end; j++) { 1414 id = rbuf1_i[j] - rstart; 1415 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1416 sbuf2_i[j] = ncols; 1417 req_size[idex] += ncols; 1418 } 1419 req_source[idex] = r_status1[i].MPI_SOURCE; 1420 /* form the header */ 1421 sbuf2_i[0] = req_size[idex]; 1422 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1423 1424 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1425 } 1426 } 1427 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1428 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1429 1430 /* recv buffer sizes */ 1431 /* Receive messages*/ 1432 1433 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 1434 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1435 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 1436 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1437 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 1438 1439 for (i=0; i<nrqs; ++i) { 1440 ierr = 
MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1441 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 1442 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 1443 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1444 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1445 } 1446 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1447 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1448 1449 /* Wait on sends1 and sends2 */ 1450 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1451 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 1452 1453 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1454 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1455 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1456 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1457 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1458 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1459 1460 /* Now allocate buffers for a->j, and send them off */ 1461 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1462 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1463 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1464 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1465 1466 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 1467 { 1468 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1469 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1470 PetscInt cend = C->cmap->rend; 1471 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1472 1473 for (i=0; i<nrqr; i++) { 1474 rbuf1_i = rbuf1[i]; 1475 sbuf_aj_i = sbuf_aj[i]; 1476 ct1 = 2*rbuf1_i[0] + 1; 1477 ct2 = 0; 1478 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1479 kmax = rbuf1[i][2*j]; 1480 for (k=0; k<kmax; k++,ct1++) { 1481 row = rbuf1_i[ct1] - rstart; 1482 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 
1483 ncols = nzA + nzB; 1484 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1485 1486 /* load the column indices for this row into cols*/ 1487 cols = sbuf_aj_i + ct2; 1488 1489 lwrite = 0; 1490 for (l=0; l<nzB; l++) { 1491 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1492 } 1493 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1494 for (l=0; l<nzB; l++) { 1495 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1496 } 1497 1498 ct2 += ncols; 1499 } 1500 } 1501 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1502 } 1503 } 1504 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 1505 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 1506 1507 /* Allocate buffers for a->a, and send them off */ 1508 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1509 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1510 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1511 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1512 1513 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1514 { 1515 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1516 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1517 PetscInt cend = C->cmap->rend; 1518 PetscInt *b_j = b->j; 1519 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1520 1521 for (i=0; i<nrqr; i++) { 1522 rbuf1_i = rbuf1[i]; 1523 sbuf_aa_i = sbuf_aa[i]; 1524 ct1 = 2*rbuf1_i[0]+1; 1525 ct2 = 0; 1526 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1527 kmax = rbuf1_i[2*j]; 1528 for (k=0; k<kmax; k++,ct1++) { 1529 row = rbuf1_i[ct1] - rstart; 1530 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1531 ncols = nzA + nzB; 1532 cworkB = b_j + b_i[row]; 1533 vworkA = a_a + a_i[row]; 1534 vworkB = b_a + b_i[row]; 1535 1536 /* load the column values for this row into vals*/ 1537 vals = sbuf_aa_i+ct2; 1538 1539 lwrite = 0; 1540 for (l=0; l<nzB; l++) { 1541 if ((bmap[cworkB[l]]) < cstart) 
vals[lwrite++] = vworkB[l]; 1542 } 1543 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1544 for (l=0; l<nzB; l++) { 1545 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1546 } 1547 1548 ct2 += ncols; 1549 } 1550 } 1551 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1552 } 1553 } 1554 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1555 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1556 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1557 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1558 1559 /* Form the matrix */ 1560 /* create col map: global col of C -> local col of submatrices */ 1561 { 1562 const PetscInt *icol_i; 1563 #if defined(PETSC_USE_CTABLE) 1564 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1565 for (i=0; i<ismax; i++) { 1566 if (!allcolumns[i]) { 1567 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1568 1569 jmax = ncol[i]; 1570 icol_i = icol[i]; 1571 cmap_i = cmap[i]; 1572 for (j=0; j<jmax; j++) { 1573 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1574 } 1575 } else { 1576 cmap[i] = NULL; 1577 } 1578 } 1579 #else 1580 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1581 for (i=0; i<ismax; i++) { 1582 if (!allcolumns[i]) { 1583 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1584 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1585 jmax = ncol[i]; 1586 icol_i = icol[i]; 1587 cmap_i = cmap[i]; 1588 for (j=0; j<jmax; j++) { 1589 cmap_i[icol_i[j]] = j+1; 1590 } 1591 } else { 1592 cmap[i] = NULL; 1593 } 1594 } 1595 #endif 1596 } 1597 1598 /* Create lens which is required for MatCreate... 
*/ 1599 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1600 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1601 if (ismax) { 1602 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1603 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1604 } 1605 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1606 1607 /* Update lens from local data */ 1608 for (i=0; i<ismax; i++) { 1609 jmax = nrow[i]; 1610 if (!allcolumns[i]) cmap_i = cmap[i]; 1611 irow_i = irow[i]; 1612 lens_i = lens[i]; 1613 for (j=0; j<jmax; j++) { 1614 l = 0; 1615 row = irow_i[j]; 1616 while (row >= C->rmap->range[l+1]) l++; 1617 proc = l; 1618 if (proc == rank) { 1619 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1620 if (!allcolumns[i]) { 1621 for (k=0; k<ncols; k++) { 1622 #if defined(PETSC_USE_CTABLE) 1623 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1624 #else 1625 tcol = cmap_i[cols[k]]; 1626 #endif 1627 if (tcol) lens_i[j]++; 1628 } 1629 } else { /* allcolumns */ 1630 lens_i[j] = ncols; 1631 } 1632 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1633 } 1634 } 1635 } 1636 1637 /* Create row map: global row of C -> local row of submatrices */ 1638 #if defined(PETSC_USE_CTABLE) 1639 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1640 for (i=0; i<ismax; i++) { 1641 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1642 rmap_i = rmap[i]; 1643 irow_i = irow[i]; 1644 jmax = nrow[i]; 1645 for (j=0; j<jmax; j++) { 1646 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1647 } 1648 } 1649 #else 1650 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1651 if (ismax) { 1652 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1653 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1654 } 1655 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1656 for (i=0; i<ismax; i++) { 1657 rmap_i = rmap[i]; 1658 irow_i = irow[i]; 1659 jmax = nrow[i]; 1660 for (j=0; j<jmax; 
j++) { 1661 rmap_i[irow_i[j]] = j; 1662 } 1663 } 1664 #endif 1665 1666 /* Update lens from offproc data */ 1667 { 1668 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1669 1670 for (tmp2=0; tmp2<nrqs; tmp2++) { 1671 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1672 idex = pa[idex2]; 1673 sbuf1_i = sbuf1[idex]; 1674 jmax = sbuf1_i[0]; 1675 ct1 = 2*jmax+1; 1676 ct2 = 0; 1677 rbuf2_i = rbuf2[idex2]; 1678 rbuf3_i = rbuf3[idex2]; 1679 for (j=1; j<=jmax; j++) { 1680 is_no = sbuf1_i[2*j-1]; 1681 max1 = sbuf1_i[2*j]; 1682 lens_i = lens[is_no]; 1683 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1684 rmap_i = rmap[is_no]; 1685 for (k=0; k<max1; k++,ct1++) { 1686 #if defined(PETSC_USE_CTABLE) 1687 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1688 row--; 1689 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1690 #else 1691 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1692 #endif 1693 max2 = rbuf2_i[ct1]; 1694 for (l=0; l<max2; l++,ct2++) { 1695 if (!allcolumns[is_no]) { 1696 #if defined(PETSC_USE_CTABLE) 1697 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1698 #else 1699 tcol = cmap_i[rbuf3_i[ct2]]; 1700 #endif 1701 if (tcol) lens_i[row]++; 1702 } else { /* allcolumns */ 1703 lens_i[row]++; /* lens_i[row] += max2 ? */ 1704 } 1705 } 1706 } 1707 } 1708 } 1709 } 1710 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1711 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1712 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1713 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1714 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1715 1716 /* Create the submatrices */ 1717 if (scall == MAT_REUSE_MATRIX) { 1718 PetscBool flag; 1719 1720 /* 1721 Assumes new rows are same length as the old rows,hence bug! 
1722 */ 1723 for (i=0; i<ismax; i++) { 1724 mat = (Mat_SeqAIJ*)(submats[i]->data); 1725 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1726 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1727 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1728 /* Initial matrix as if empty */ 1729 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1730 1731 submats[i]->factortype = C->factortype; 1732 } 1733 } else { 1734 for (i=0; i<ismax; i++) { 1735 PetscInt rbs,cbs; 1736 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1737 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1738 1739 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1740 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1741 1742 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1743 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1744 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1745 } 1746 } 1747 1748 /* Assemble the matrices */ 1749 /* First assemble the local rows */ 1750 { 1751 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1752 PetscScalar *imat_a; 1753 1754 for (i=0; i<ismax; i++) { 1755 mat = (Mat_SeqAIJ*)submats[i]->data; 1756 imat_ilen = mat->ilen; 1757 imat_j = mat->j; 1758 imat_i = mat->i; 1759 imat_a = mat->a; 1760 1761 if (!allcolumns[i]) cmap_i = cmap[i]; 1762 rmap_i = rmap[i]; 1763 irow_i = irow[i]; 1764 jmax = nrow[i]; 1765 for (j=0; j<jmax; j++) { 1766 l = 0; 1767 row = irow_i[j]; 1768 while (row >= C->rmap->range[l+1]) l++; 1769 proc = l; 1770 if (proc == rank) { 1771 old_row = row; 1772 #if defined(PETSC_USE_CTABLE) 1773 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1774 row--; 1775 #else 1776 row = rmap_i[row]; 1777 #endif 
1778 ilen_row = imat_ilen[row]; 1779 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1780 mat_i = imat_i[row]; 1781 mat_a = imat_a + mat_i; 1782 mat_j = imat_j + mat_i; 1783 if (!allcolumns[i]) { 1784 for (k=0; k<ncols; k++) { 1785 #if defined(PETSC_USE_CTABLE) 1786 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1787 #else 1788 tcol = cmap_i[cols[k]]; 1789 #endif 1790 if (tcol) { 1791 *mat_j++ = tcol - 1; 1792 *mat_a++ = vals[k]; 1793 ilen_row++; 1794 } 1795 } 1796 } else { /* allcolumns */ 1797 for (k=0; k<ncols; k++) { 1798 *mat_j++ = cols[k]; /* global col index! */ 1799 *mat_a++ = vals[k]; 1800 ilen_row++; 1801 } 1802 } 1803 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1804 1805 imat_ilen[row] = ilen_row; 1806 } 1807 } 1808 } 1809 } 1810 1811 /* Now assemble the off proc rows*/ 1812 { 1813 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1814 PetscInt *imat_j,*imat_i; 1815 PetscScalar *imat_a,*rbuf4_i; 1816 1817 for (tmp2=0; tmp2<nrqs; tmp2++) { 1818 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1819 idex = pa[idex2]; 1820 sbuf1_i = sbuf1[idex]; 1821 jmax = sbuf1_i[0]; 1822 ct1 = 2*jmax + 1; 1823 ct2 = 0; 1824 rbuf2_i = rbuf2[idex2]; 1825 rbuf3_i = rbuf3[idex2]; 1826 rbuf4_i = rbuf4[idex2]; 1827 for (j=1; j<=jmax; j++) { 1828 is_no = sbuf1_i[2*j-1]; 1829 rmap_i = rmap[is_no]; 1830 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1831 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1832 imat_ilen = mat->ilen; 1833 imat_j = mat->j; 1834 imat_i = mat->i; 1835 imat_a = mat->a; 1836 max1 = sbuf1_i[2*j]; 1837 for (k=0; k<max1; k++,ct1++) { 1838 row = sbuf1_i[ct1]; 1839 #if defined(PETSC_USE_CTABLE) 1840 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1841 row--; 1842 #else 1843 row = rmap_i[row]; 1844 #endif 1845 ilen = imat_ilen[row]; 1846 mat_i = imat_i[row]; 1847 mat_a = imat_a + mat_i; 1848 mat_j = imat_j + mat_i; 1849 max2 = rbuf2_i[ct1]; 1850 if (!allcolumns[is_no]) { 1851 
for (l=0; l<max2; l++,ct2++) { 1852 1853 #if defined(PETSC_USE_CTABLE) 1854 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1855 #else 1856 tcol = cmap_i[rbuf3_i[ct2]]; 1857 #endif 1858 if (tcol) { 1859 *mat_j++ = tcol - 1; 1860 *mat_a++ = rbuf4_i[ct2]; 1861 ilen++; 1862 } 1863 } 1864 } else { /* allcolumns */ 1865 for (l=0; l<max2; l++,ct2++) { 1866 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1867 *mat_a++ = rbuf4_i[ct2]; 1868 ilen++; 1869 } 1870 } 1871 imat_ilen[row] = ilen; 1872 } 1873 } 1874 } 1875 } 1876 1877 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1878 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1879 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1880 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1881 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1882 1883 /* Restore the indices */ 1884 for (i=0; i<ismax; i++) { 1885 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1886 if (!allcolumns[i]) { 1887 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1888 } 1889 } 1890 1891 /* Destroy allocated memory */ 1892 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1893 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1894 ierr = PetscFree(pa);CHKERRQ(ierr); 1895 1896 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1897 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1898 for (i=0; i<nrqr; ++i) { 1899 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1900 } 1901 for (i=0; i<nrqs; ++i) { 1902 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1903 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1904 } 1905 1906 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1907 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1908 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1909 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1910 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1911 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1912 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1913 1914 #if defined(PETSC_USE_CTABLE) 1915 for (i=0; i<ismax; i++) {ierr = 
PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  /* Free the column maps; only index sets without the allcolumns shortcut own one. */
  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  /* lens[0] is the single contiguous buffer backing all per-IS row-length arrays. */
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Final assembly of every extracted sequential submatrix. */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   Builds an MPIAIJ matrix on comm whose local diagonal block is A and local
   off-diagonal block is B.  The result borrows A and B directly (no reference
   counts are taken) — see the note above.
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C:
     both must be SEQAIJ, with matching local row counts and block sizes. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The global column count implied by the layout must equal the columns of A plus B. */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  /* Install A and B directly as the diag/off-diag parts; no reference counts are taken. */
  aij    = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* The composite matrix is only as ready as its component parts. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
   Returns borrowed references to the sequential diagonal (*A) and off-diagonal (*B)
   blocks of the MPIAIJ matrix C.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful!
*/ 1992 PetscFunctionReturn(0); 1993 } 1994 1995 #undef __FUNCT__ 1996 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1997 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1998 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1999 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 2000 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 2001 { 2002 PetscErrorCode ierr; 2003 PetscMPIInt size, flag; 2004 PetscInt i,ii; 2005 PetscInt ismax_c; 2006 2007 PetscFunctionBegin; 2008 if (!ismax) PetscFunctionReturn(0); 2009 2010 for (i = 0, ismax_c = 0; i < ismax; ++i) { 2011 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr); 2012 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 2013 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 2014 if (size > 1) ++ismax_c; 2015 } 2016 if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 2017 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2018 } else { /* if (ismax_c) */ 2019 Mat *A,*B; 2020 IS *isrow_c, *iscol_c; 2021 PetscMPIInt size; 2022 /* 2023 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 2024 array of sequential matrices underlying the resulting parallel matrices. 2025 Which arrays to allocate is based on the value of MatReuse scall. 2026 There are as many diag matrices as there are original index sets. 2027 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 
2028 2029 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 2030 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 2031 will deposite the extracted diag and off-diag parts. 2032 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 2033 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 2034 */ 2035 2036 /* Parallel matrix array is allocated only if no reuse is taking place. */ 2037 if (scall != MAT_REUSE_MATRIX) { 2038 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 2039 } else { 2040 ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr); 2041 ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr); 2042 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 2043 for (i = 0, ii = 0; i < ismax; ++i) { 2044 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 2045 if (size > 1) { 2046 ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr); 2047 ++ii; 2048 } else A[i] = (*submat)[i]; 2049 } 2050 } 2051 /* 2052 Construct the complements of the iscol ISs for parallel ISs only. 2053 These are used to extract the off-diag portion of the resulting parallel matrix. 2054 The row IS for the off-diag portion is the same as for the diag portion, 2055 so we merely alias the row IS, while skipping those that are sequential. 2056 */ 2057 ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr); 2058 for (i = 0, ii = 0; i < ismax; ++i) { 2059 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 2060 if (size > 1) { 2061 isrow_c[ii] = isrow[i]; 2062 2063 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr); 2064 ++ii; 2065 } 2066 } 2067 /* Now obtain the sequential A and B submatrices separately. 
*/ 2068 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr); 2069 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr); 2070 for (ii = 0; ii < ismax_c; ++ii) { 2071 ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr); 2072 } 2073 ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr); 2074 /* 2075 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 2076 have been extracted directly into the parallel matrices containing them, or 2077 simply into the sequential matrix identical with the corresponding A (if size == 1). 2078 Otherwise, make sure that parallel matrices are constructed from A & B, or the 2079 A is put into the correct submat slot (if size == 1). 2080 */ 2081 if (scall != MAT_REUSE_MATRIX) { 2082 for (i = 0, ii = 0; i < ismax; ++i) { 2083 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 2084 if (size > 1) { 2085 /* 2086 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 2087 */ 2088 /* Construct submat[i] from the Seq pieces A and B. */ 2089 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr); 2090 2091 ++ii; 2092 } else (*submat)[i] = A[i]; 2093 } 2094 } 2095 ierr = PetscFree(A);CHKERRQ(ierr); 2096 ierr = PetscFree(B);CHKERRQ(ierr); 2097 } 2098 PetscFunctionReturn(0); 2099 } /* MatGetSubMatricesParallel_MPIXAIJ() */ 2100 2101 2102 2103 #undef __FUNCT__ 2104 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 2105 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2106 { 2107 PetscErrorCode ierr; 2108 2109 PetscFunctionBegin; 2110 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr); 2111 PetscFunctionReturn(0); 2112 } 2113