/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   Grow each of the imax index sets in is[] by ov levels of overlap,
   one level per call to MatIncreaseOverlap_MPIAIJ_Once.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
/*
   Same contract as MatIncreaseOverlap_MPIAIJ but uses the memory-scalable
   PetscSF-based one-level routine.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr =
MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
/*
   One level of overlap growth, memory-scalable version: rows of is[] owned
   by other ranks are shipped to their owners via a PetscSF, expanded there
   into column indices (see _Send_Scalable), and the replies are merged back
   into is[] (see _Receive_Scalable). Locally owned rows are expanded by
   _Local_Scalable, overlapped with the communication.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes,*sbdata,nto,nfrom;
  const PetscInt **indices,*indices_i;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  /* communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* row map */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data */
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /* get length and indices */
  for(i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for(i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for(j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if(owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   */
  if(!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks */
  for(i=0; i<size; i++){
    if(tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: 2 entries (is id, row) per requested row */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for(i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* build a star forest */
  nrecvrows = 0;
  for(i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nfrom; i++){
    for(j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for(i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* deal with remote data */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++){
    for(j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
/*
   Merge the received triples <is id, row count, rows> into the local index
   sets: each is[i] is destroyed and recreated as the sorted, de-duplicated
   union of its previous entries and the received rows.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ?
lsize:max_lsize;
    isz[i] = lsize;
  }
  /* one slab of (max_lsize+nrecvs) entries per index set: worst case every
     received entry lands in the same set */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information */
  for(i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for(j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for(i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
/*
   For each row requested by another rank, collect the column indices of that
   row's nonzeros (diagonal part Amat and off-diagonal part Bmat) and pack the
   reply messages <is id, number of rows, rows> into *sbrows with per-rank
   sizes/offsets in *sbrowsizes. Caller frees both arrays.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr =
PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* longest message */
  max_fszs = 0;
  for(i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count the (de-duplicated) columns per <rank,is> pair so a
     single exact-size reply buffer can be allocated below */
  for(i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data */
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++]; /* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for(j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i])*/
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i]) */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: use this code again, now packing the columns into sbdata */
  for(i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for(j=0; j<nidx; j++){
      if(!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* offsets */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for(i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if(sbrowsizes) *sbrowsizes = sbsizes;
  if(sbrows)     *sbrows     = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
/*
   Expand each index set locally: every locally owned row in is[i] is
   replaced by the column indices of its nonzeros (Amat and Bmat parts),
   sorted and de-duplicated; rows owned by other ranks are skipped.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   */
  ierr =
PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for(j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if(owner != rank) continue; /* remote rows are handled by the owning rank */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for(k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for(k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static
/*
   One level of overlap growth, original (non-scalable) version: uses dense
   per-IS marker tables of global size M and raw MPI_Isend/Irecv exchanges.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into one slice per destination rank */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* appending to data_i while scanning its first isz[i] entries; the bit
       table guards against inserting duplicates */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* Count total requested rows (ct) and total request payload (total_sz) over
     all incoming messages; each message is [no_of_IS, (is_id,count) pairs, rows...] */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Initial reply-size guess: requested rows times average nonzeros per local row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;          /* read cursor into rbuf_i (past the header) */
    ct2    =  ct1;                 /* write cursor into xdata[i] */
    ct3   += ct1;                  /* total PetscInts consumed in xdata[0] so far */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* Out of room: grow xdata[0] by 1.5x and rebase xdata[1..i],
               which are all pointers into the same single allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Append the neighbors (one level of overlap) of every row accepted above */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat columns, shifted to global numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat columns, mapped through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;  /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  /* NOTE(review): mem_estimate/ct3 count PetscInts, not bytes — the wording of
     the log string below looks misleading; confirm before changing it */
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* local row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, merged into a single
       globally-numbered row (lower B part, then A part, then upper B part)
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built; just look up the matrix */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* not allocated above because scall == MAT_REUSE_MATRIX */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr =
MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extract several sequential submatrices from a
   parallel AIJ matrix, staging the extraction to bound memory use.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* take the fast path only if EVERY process wants the whole matrix (MPI_MIN) */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  /* NOTE(review): divides by C->cmap->N — division by zero if the matrix has
     zero columns; confirm whether a 0-column matrix can reach here */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;   /* this rank is done; participate with an empty stage */
    else max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
1211 const PetscInt **icol,**irow; 1212 PetscInt *nrow,*ncol,start; 1213 PetscErrorCode ierr; 1214 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 1215 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 1216 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 1217 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 1218 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 1219 #if defined(PETSC_USE_CTABLE) 1220 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 1221 #else 1222 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 1223 #endif 1224 const PetscInt *irow_i; 1225 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 1226 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1227 MPI_Request *r_waits4,*s_waits3,*s_waits4; 1228 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 1229 MPI_Status *r_status3,*r_status4,*s_status4; 1230 MPI_Comm comm; 1231 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 1232 PetscMPIInt *onodes1,*olengths1; 1233 PetscMPIInt idex,idex2,end; 1234 1235 PetscFunctionBegin; 1236 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1237 tag0 = ((PetscObject)C)->tag; 1238 size = c->size; 1239 rank = c->rank; 1240 1241 /* Get some new tags to keep the communication clean */ 1242 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 1243 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1244 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1245 1246 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 1247 1248 for (i=0; i<ismax; i++) { 1249 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 1250 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 1251 if (allcolumns[i]) { 1252 icol[i] = NULL; 1253 ncol[i] = C->cmap->N; 1254 } else { 1255 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 1256 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 1257 } 1258 } 1259 1260 /* evaluate 
communication - mesg to who, length of mesg, and buffer space 1261 required. Based on this, buffers are allocated, and data copied into them*/ 1262 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 1263 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1264 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1265 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1266 for (i=0; i<ismax; i++) { 1267 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1268 jmax = nrow[i]; 1269 irow_i = irow[i]; 1270 for (j=0; j<jmax; j++) { 1271 l = 0; 1272 row = irow_i[j]; 1273 while (row >= C->rmap->range[l+1]) l++; 1274 proc = l; 1275 w4[proc]++; 1276 } 1277 for (j=0; j<size; j++) { 1278 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 1279 } 1280 } 1281 1282 nrqs = 0; /* no of outgoing messages */ 1283 msz = 0; /* total mesg length (for all procs) */ 1284 w1[rank] = 0; /* no mesg sent to self */ 1285 w3[rank] = 0; 1286 for (i=0; i<size; i++) { 1287 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 1288 } 1289 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1290 for (i=0,j=0; i<size; i++) { 1291 if (w1[i]) { pa[j] = i; j++; } 1292 } 1293 1294 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1295 for (i=0; i<nrqs; i++) { 1296 j = pa[i]; 1297 w1[j] += w2[j] + 2* w3[j]; 1298 msz += w1[j]; 1299 } 1300 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1301 1302 /* Determine the number of messages to expect, their lengths, from from-ids */ 1303 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1304 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1305 1306 /* Now post the Irecvs corresponding to these messages */ 1307 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1308 1309 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1310 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1311 1312 /* Allocate Memory for outgoing messages */ 1313 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1314 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1315 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1316 1317 { 1318 PetscInt *iptr = tmp,ict = 0; 1319 for (i=0; i<nrqs; i++) { 1320 j = pa[i]; 1321 iptr += ict; 1322 sbuf1[j] = iptr; 1323 ict = w1[j]; 1324 } 1325 } 1326 1327 /* Form the outgoing messages */ 1328 /* Initialize the header space */ 1329 for (i=0; i<nrqs; i++) { 1330 j = pa[i]; 1331 sbuf1[j][0] = 0; 1332 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 1333 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 1334 } 1335 1336 /* Parse the isrow and copy data into outbuf */ 1337 for (i=0; i<ismax; i++) { 1338 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1339 irow_i = irow[i]; 1340 jmax = nrow[i]; 1341 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 1342 l = 0; 1343 row = irow_i[j]; 1344 while (row >= C->rmap->range[l+1]) l++; 1345 proc = l; 1346 if (proc != rank) { /* copy to the outgoing buf*/ 1347 ctr[proc]++; 1348 *ptr[proc] = row; 1349 ptr[proc]++; 1350 } 1351 } 1352 /* Update the headers for the current IS */ 1353 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1354 if ((ctr_j = ctr[j])) { 1355 sbuf1_j = sbuf1[j]; 1356 k = ++sbuf1_j[0]; 1357 sbuf1_j[2*k] = ctr_j; 1358 sbuf1_j[2*k-1] = i; 1359 } 1360 } 1361 } 1362 1363 /* Now post the sends */ 1364 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1365 for (i=0; i<nrqs; ++i) { 1366 j = pa[i]; 1367 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 1368 } 1369 1370 /* Post Receives to capture the buffer size */ 1371 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 1372 
ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 1373 rbuf2[0] = tmp + msz; 1374 for (i=1; i<nrqs; ++i) { 1375 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 1376 } 1377 for (i=0; i<nrqs; ++i) { 1378 j = pa[i]; 1379 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 1380 } 1381 1382 /* Send to other procs the buf size they should allocate */ 1383 1384 1385 /* Receive messages*/ 1386 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 1387 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1388 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 1389 { 1390 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 1391 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 1392 PetscInt *sbuf2_i; 1393 1394 for (i=0; i<nrqr; ++i) { 1395 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1396 1397 req_size[idex] = 0; 1398 rbuf1_i = rbuf1[idex]; 1399 start = 2*rbuf1_i[0] + 1; 1400 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1401 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 1402 sbuf2_i = sbuf2[idex]; 1403 for (j=start; j<end; j++) { 1404 id = rbuf1_i[j] - rstart; 1405 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1406 sbuf2_i[j] = ncols; 1407 req_size[idex] += ncols; 1408 } 1409 req_source[idex] = r_status1[i].MPI_SOURCE; 1410 /* form the header */ 1411 sbuf2_i[0] = req_size[idex]; 1412 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1413 1414 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1415 } 1416 } 1417 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1418 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1419 1420 /* recv buffer sizes */ 1421 /* Receive messages*/ 1422 1423 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 1424 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1425 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 1426 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1427 ierr = 
PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 1428 1429 for (i=0; i<nrqs; ++i) { 1430 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1431 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 1432 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 1433 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1434 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1435 } 1436 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1437 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1438 1439 /* Wait on sends1 and sends2 */ 1440 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1441 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 1442 1443 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1444 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1445 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1446 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1447 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1448 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1449 1450 /* Now allocate buffers for a->j, and send them off */ 1451 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1452 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1453 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1454 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1455 1456 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 1457 { 1458 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1459 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1460 PetscInt cend = C->cmap->rend; 1461 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1462 1463 for (i=0; i<nrqr; i++) { 1464 rbuf1_i = rbuf1[i]; 1465 sbuf_aj_i = sbuf_aj[i]; 1466 ct1 = 2*rbuf1_i[0] + 1; 1467 ct2 = 0; 1468 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1469 kmax = rbuf1[i][2*j]; 1470 for (k=0; k<kmax; k++,ct1++) { 
1471 row = rbuf1_i[ct1] - rstart; 1472 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1473 ncols = nzA + nzB; 1474 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1475 1476 /* load the column indices for this row into cols*/ 1477 cols = sbuf_aj_i + ct2; 1478 1479 lwrite = 0; 1480 for (l=0; l<nzB; l++) { 1481 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1482 } 1483 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1484 for (l=0; l<nzB; l++) { 1485 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1486 } 1487 1488 ct2 += ncols; 1489 } 1490 } 1491 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1492 } 1493 } 1494 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 1495 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 1496 1497 /* Allocate buffers for a->a, and send them off */ 1498 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1499 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1500 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1501 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1502 1503 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1504 { 1505 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1506 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1507 PetscInt cend = C->cmap->rend; 1508 PetscInt *b_j = b->j; 1509 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1510 1511 for (i=0; i<nrqr; i++) { 1512 rbuf1_i = rbuf1[i]; 1513 sbuf_aa_i = sbuf_aa[i]; 1514 ct1 = 2*rbuf1_i[0]+1; 1515 ct2 = 0; 1516 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1517 kmax = rbuf1_i[2*j]; 1518 for (k=0; k<kmax; k++,ct1++) { 1519 row = rbuf1_i[ct1] - rstart; 1520 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1521 ncols = nzA + nzB; 1522 cworkB = b_j + b_i[row]; 1523 vworkA = a_a + a_i[row]; 1524 vworkB = b_a + b_i[row]; 1525 1526 /* load the column values for this row into vals*/ 1527 vals = 
sbuf_aa_i+ct2; 1528 1529 lwrite = 0; 1530 for (l=0; l<nzB; l++) { 1531 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1532 } 1533 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1534 for (l=0; l<nzB; l++) { 1535 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1536 } 1537 1538 ct2 += ncols; 1539 } 1540 } 1541 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1542 } 1543 } 1544 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1545 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1546 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1547 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1548 1549 /* Form the matrix */ 1550 /* create col map: global col of C -> local col of submatrices */ 1551 { 1552 const PetscInt *icol_i; 1553 #if defined(PETSC_USE_CTABLE) 1554 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1555 for (i=0; i<ismax; i++) { 1556 if (!allcolumns[i]) { 1557 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1558 1559 jmax = ncol[i]; 1560 icol_i = icol[i]; 1561 cmap_i = cmap[i]; 1562 for (j=0; j<jmax; j++) { 1563 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1564 } 1565 } else { 1566 cmap[i] = NULL; 1567 } 1568 } 1569 #else 1570 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1571 for (i=0; i<ismax; i++) { 1572 if (!allcolumns[i]) { 1573 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1574 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1575 jmax = ncol[i]; 1576 icol_i = icol[i]; 1577 cmap_i = cmap[i]; 1578 for (j=0; j<jmax; j++) { 1579 cmap_i[icol_i[j]] = j+1; 1580 } 1581 } else { 1582 cmap[i] = NULL; 1583 } 1584 } 1585 #endif 1586 } 1587 1588 /* Create lens which is required for MatCreate... 
*/ 1589 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1590 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1591 if (ismax) { 1592 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1593 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1594 } 1595 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1596 1597 /* Update lens from local data */ 1598 for (i=0; i<ismax; i++) { 1599 jmax = nrow[i]; 1600 if (!allcolumns[i]) cmap_i = cmap[i]; 1601 irow_i = irow[i]; 1602 lens_i = lens[i]; 1603 for (j=0; j<jmax; j++) { 1604 l = 0; 1605 row = irow_i[j]; 1606 while (row >= C->rmap->range[l+1]) l++; 1607 proc = l; 1608 if (proc == rank) { 1609 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1610 if (!allcolumns[i]) { 1611 for (k=0; k<ncols; k++) { 1612 #if defined(PETSC_USE_CTABLE) 1613 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1614 #else 1615 tcol = cmap_i[cols[k]]; 1616 #endif 1617 if (tcol) lens_i[j]++; 1618 } 1619 } else { /* allcolumns */ 1620 lens_i[j] = ncols; 1621 } 1622 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1623 } 1624 } 1625 } 1626 1627 /* Create row map: global row of C -> local row of submatrices */ 1628 #if defined(PETSC_USE_CTABLE) 1629 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1630 for (i=0; i<ismax; i++) { 1631 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1632 rmap_i = rmap[i]; 1633 irow_i = irow[i]; 1634 jmax = nrow[i]; 1635 for (j=0; j<jmax; j++) { 1636 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1637 } 1638 } 1639 #else 1640 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1641 if (ismax) { 1642 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1643 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1644 } 1645 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1646 for (i=0; i<ismax; i++) { 1647 rmap_i = rmap[i]; 1648 irow_i = irow[i]; 1649 jmax = nrow[i]; 1650 for (j=0; j<jmax; 
j++) { 1651 rmap_i[irow_i[j]] = j; 1652 } 1653 } 1654 #endif 1655 1656 /* Update lens from offproc data */ 1657 { 1658 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1659 1660 for (tmp2=0; tmp2<nrqs; tmp2++) { 1661 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1662 idex = pa[idex2]; 1663 sbuf1_i = sbuf1[idex]; 1664 jmax = sbuf1_i[0]; 1665 ct1 = 2*jmax+1; 1666 ct2 = 0; 1667 rbuf2_i = rbuf2[idex2]; 1668 rbuf3_i = rbuf3[idex2]; 1669 for (j=1; j<=jmax; j++) { 1670 is_no = sbuf1_i[2*j-1]; 1671 max1 = sbuf1_i[2*j]; 1672 lens_i = lens[is_no]; 1673 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1674 rmap_i = rmap[is_no]; 1675 for (k=0; k<max1; k++,ct1++) { 1676 #if defined(PETSC_USE_CTABLE) 1677 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1678 row--; 1679 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1680 #else 1681 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1682 #endif 1683 max2 = rbuf2_i[ct1]; 1684 for (l=0; l<max2; l++,ct2++) { 1685 if (!allcolumns[is_no]) { 1686 #if defined(PETSC_USE_CTABLE) 1687 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1688 #else 1689 tcol = cmap_i[rbuf3_i[ct2]]; 1690 #endif 1691 if (tcol) lens_i[row]++; 1692 } else { /* allcolumns */ 1693 lens_i[row]++; /* lens_i[row] += max2 ? */ 1694 } 1695 } 1696 } 1697 } 1698 } 1699 } 1700 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1701 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1702 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1703 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1704 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1705 1706 /* Create the submatrices */ 1707 if (scall == MAT_REUSE_MATRIX) { 1708 PetscBool flag; 1709 1710 /* 1711 Assumes new rows are same length as the old rows,hence bug! 
1712 */ 1713 for (i=0; i<ismax; i++) { 1714 mat = (Mat_SeqAIJ*)(submats[i]->data); 1715 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1716 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1717 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1718 /* Initial matrix as if empty */ 1719 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1720 1721 submats[i]->factortype = C->factortype; 1722 } 1723 } else { 1724 for (i=0; i<ismax; i++) { 1725 PetscInt rbs,cbs; 1726 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1727 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1728 1729 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1730 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1731 1732 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1733 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1734 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1735 } 1736 } 1737 1738 /* Assemble the matrices */ 1739 /* First assemble the local rows */ 1740 { 1741 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1742 PetscScalar *imat_a; 1743 1744 for (i=0; i<ismax; i++) { 1745 mat = (Mat_SeqAIJ*)submats[i]->data; 1746 imat_ilen = mat->ilen; 1747 imat_j = mat->j; 1748 imat_i = mat->i; 1749 imat_a = mat->a; 1750 1751 if (!allcolumns[i]) cmap_i = cmap[i]; 1752 rmap_i = rmap[i]; 1753 irow_i = irow[i]; 1754 jmax = nrow[i]; 1755 for (j=0; j<jmax; j++) { 1756 l = 0; 1757 row = irow_i[j]; 1758 while (row >= C->rmap->range[l+1]) l++; 1759 proc = l; 1760 if (proc == rank) { 1761 old_row = row; 1762 #if defined(PETSC_USE_CTABLE) 1763 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1764 row--; 1765 #else 1766 row = rmap_i[row]; 1767 #endif 
1768 ilen_row = imat_ilen[row]; 1769 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1770 mat_i = imat_i[row]; 1771 mat_a = imat_a + mat_i; 1772 mat_j = imat_j + mat_i; 1773 if (!allcolumns[i]) { 1774 for (k=0; k<ncols; k++) { 1775 #if defined(PETSC_USE_CTABLE) 1776 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1777 #else 1778 tcol = cmap_i[cols[k]]; 1779 #endif 1780 if (tcol) { 1781 *mat_j++ = tcol - 1; 1782 *mat_a++ = vals[k]; 1783 ilen_row++; 1784 } 1785 } 1786 } else { /* allcolumns */ 1787 for (k=0; k<ncols; k++) { 1788 *mat_j++ = cols[k]; /* global col index! */ 1789 *mat_a++ = vals[k]; 1790 ilen_row++; 1791 } 1792 } 1793 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1794 1795 imat_ilen[row] = ilen_row; 1796 } 1797 } 1798 } 1799 } 1800 1801 /* Now assemble the off proc rows*/ 1802 { 1803 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1804 PetscInt *imat_j,*imat_i; 1805 PetscScalar *imat_a,*rbuf4_i; 1806 1807 for (tmp2=0; tmp2<nrqs; tmp2++) { 1808 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1809 idex = pa[idex2]; 1810 sbuf1_i = sbuf1[idex]; 1811 jmax = sbuf1_i[0]; 1812 ct1 = 2*jmax + 1; 1813 ct2 = 0; 1814 rbuf2_i = rbuf2[idex2]; 1815 rbuf3_i = rbuf3[idex2]; 1816 rbuf4_i = rbuf4[idex2]; 1817 for (j=1; j<=jmax; j++) { 1818 is_no = sbuf1_i[2*j-1]; 1819 rmap_i = rmap[is_no]; 1820 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1821 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1822 imat_ilen = mat->ilen; 1823 imat_j = mat->j; 1824 imat_i = mat->i; 1825 imat_a = mat->a; 1826 max1 = sbuf1_i[2*j]; 1827 for (k=0; k<max1; k++,ct1++) { 1828 row = sbuf1_i[ct1]; 1829 #if defined(PETSC_USE_CTABLE) 1830 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1831 row--; 1832 #else 1833 row = rmap_i[row]; 1834 #endif 1835 ilen = imat_ilen[row]; 1836 mat_i = imat_i[row]; 1837 mat_a = imat_a + mat_i; 1838 mat_j = imat_j + mat_i; 1839 max2 = rbuf2_i[ct1]; 1840 if (!allcolumns[is_no]) { 1841 
for (l=0; l<max2; l++,ct2++) { 1842 1843 #if defined(PETSC_USE_CTABLE) 1844 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1845 #else 1846 tcol = cmap_i[rbuf3_i[ct2]]; 1847 #endif 1848 if (tcol) { 1849 *mat_j++ = tcol - 1; 1850 *mat_a++ = rbuf4_i[ct2]; 1851 ilen++; 1852 } 1853 } 1854 } else { /* allcolumns */ 1855 for (l=0; l<max2; l++,ct2++) { 1856 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1857 *mat_a++ = rbuf4_i[ct2]; 1858 ilen++; 1859 } 1860 } 1861 imat_ilen[row] = ilen; 1862 } 1863 } 1864 } 1865 } 1866 1867 /* sort the rows */ 1868 { 1869 PetscInt *imat_ilen,*imat_j,*imat_i; 1870 PetscScalar *imat_a; 1871 1872 for (i=0; i<ismax; i++) { 1873 mat = (Mat_SeqAIJ*)submats[i]->data; 1874 imat_j = mat->j; 1875 imat_i = mat->i; 1876 imat_a = mat->a; 1877 imat_ilen = mat->ilen; 1878 1879 if (allcolumns[i]) continue; 1880 jmax = nrow[i]; 1881 for (j=0; j<jmax; j++) { 1882 PetscInt ilen; 1883 1884 mat_i = imat_i[j]; 1885 mat_a = imat_a + mat_i; 1886 mat_j = imat_j + mat_i; 1887 ilen = imat_ilen[j]; 1888 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 1889 } 1890 } 1891 } 1892 1893 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1894 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1895 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1896 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1897 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1898 1899 /* Restore the indices */ 1900 for (i=0; i<ismax; i++) { 1901 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1902 if (!allcolumns[i]) { 1903 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1904 } 1905 } 1906 1907 /* Destroy allocated memory */ 1908 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1909 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1910 ierr = PetscFree(pa);CHKERRQ(ierr); 1911 1912 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1913 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1914 for (i=0; i<nrqr; ++i) { 1915 ierr = 
PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  /* Free the per-request receive buffers: column indices (rbuf3) and values (rbuf4) */
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  /* sbuf_aj/sbuf_aa: only entry [0] is freed, so each is presumably one contiguous
     slab with per-request pointers into it -- allocated earlier in this function */
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  /* Row maps: per-IS PetscTables under PETSC_USE_CTABLE, otherwise dense arrays
     sharing a single slab anchored at rmap[0] */
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    /* A column map exists only when a proper column subset was requested (not allcolumns) */
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  /* lens is one slab anchored at lens[0] */
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Final assembly of every extracted sequential submatrix */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   Builds an MPIAIJ matrix C on comm whose local diagonal block is the SeqAIJ
   matrix A and whose local off-diagonal block is the SeqAIJ matrix B.

   Input:
     comm - communicator for the new parallel matrix
     A    - diagonal block; must be MATSEQAIJ
     B    - off-diagonal block; must be MATSEQAIJ with the same local row count
            and block sizes as A
   Output:
     C    - the resulting MPIAIJ matrix; A and B are installed WITHOUT taking a
            reference (see the note above), so the caller must not destroy them
            independently.

   Errors use PETSC_ERR_PLIB because the function is private (see comment below).
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* PetscAbs(): presumably because unset row/block sizes are stored as negative
     sentinels -- TODO confirm against PetscLayout conventions */
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  /* Local sizes come from A (the diagonal block); global sizes are derived */
  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The off-diagonal block must account for exactly the non-local columns */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  /* Install the blocks directly -- note: no reference is taken on A or B */
  aij = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* C is only as ready as its least-ready component */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
   Hands back the diagonal (A) and off-diagonal (B) SeqAIJ blocks underlying the
   MPIAIJ matrix C. The returned references are borrowed: nothing is increfed.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   Extracts (possibly parallel) submatrices of C, one per (isrow[i], iscol[i]) pair.
   Index sets living on a sequential communicator yield sequential submatrices and
   are handled entirely by getsubmats_seq; index sets on communicators of size > 1
   yield parallel submatrices assembled from a diagonal part (columns in iscol[i])
   and an off-diagonal part (the non-local complement of iscol[i], obtained with
   ISGetNonlocalIS) via makefromseq.

   Input:
     C              - the parallel matrix to extract from
     ismax          - number of index-set pairs
     isrow, iscol   - row/column index sets; each pair must share a communicator
     scall          - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX
     getsubmats_seq - routine that extracts sequential submatrices from C
     makefromseq    - routine that builds a parallel matrix from seq diag/off-diag parts
     extractseq     - routine that returns (borrowed) seq diag/off-diag parts of a
                      parallel matrix, used on the reuse path
   Output:
     submat         - array of extracted submatrices (allocated here unless reusing)
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Validate the IS pairs and count the parallel ones (comm size > 1) */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat         *A,*B;
    IS          *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
      array of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are needed in any event: even if the array of parallel matrices already exists,
      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into
      which getsubmats_seq will deposit the extracted diag and off-diag parts.
      If reuse IS taking place, we have to allocate the seq matrix arrays here (and fill them from the
      existing parallel matrices); if reuse is NOT taking place, the seq matrix arrays are allocated by
      getsubmats_seq itself.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /* Borrowed references: extractseq does not incref the seq blocks */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i];
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i]; /* aliased, not increfed -- not destroyed below */

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else (*submat)[i] = A[i];
      }
    }
    /* Only the placeholder arrays are freed; the seq matrices themselves are now
       owned by (or aliased into) the entries of submat */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
} /* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   MPIAIJ specialization of MatGetSubMatricesParallel_MPIXAIJ: plugs in the
   AIJ-specific extraction, construction and block-access routines.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}