/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
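  /* Pass over all the index sets and record, for every entry owned by another rank,
     the row number, the owning rank, and the index set it came from; tosizes_temp[]
     accumulates how many rows must be sent to each rank. */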
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages;
   * generally speaking, we do not need to exchange
   * data when the overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages;
   * this usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to remote ranks so that we can get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
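  /* sbsizes/sbdata (filled by the call above) hold the reply messages, one run of
     triples <no of is, count, indices> per requesting rank; they are sent back below. */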
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since Open MPI has bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i] = lsize;
  }
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get the row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* even if the matrix is symmetric, we still treat it as nonsymmetric here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* the total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* is there a better way to estimate the number of nonzeros in the matrix? */
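  /* Two passes are made over the incoming requests: the first pass only counts, per
     requesting rank and per index set, how many distinct columns will be sent back
     (indv_counts), so that sbdata can be allocated once; the second pass fills sbdata
     with the actual reply messages. */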
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate the space needed so as to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* reuse the same grouping pass as above, this time filling in the data */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
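
/*
   Usage sketch (illustrative only, not part of this file): applications do not call
   these routines directly; they normally go through the public interface, e.g.

      Mat A;        assembled MATMPIAIJ matrix
      IS  sub[2];   index sets defining the subdomains
      ierr = MatIncreaseOverlap(A,2,sub,1);CHKERRQ(ierr);

   which dispatches to MatIncreaseOverlap_MPIAIJ() or, when the scalable algorithm is
   requested, to MatIncreaseOverlap_MPIAIJ_Scalable() above.
*/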

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* this is a better way to estimate the memory needed than the old implementation,
   * where the global size of the matrix was used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5]:

  mesg[0] = 2                  (number of index sets in the message)
  -----------
  mesg[1] = 1                  => is[1]
  mesg[2] = sizeof(is[1]);
  -----------
  mesg[3] = 5                  => is[5]
  mesg[4] = sizeof(is[5]);
  -----------
  mesg[5] ... mesg[n]          data for is[1]
  -----------
  mesg[n+1] ... mesg[m]        data for is[5]
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and their from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
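    /* t_p is one contiguous slab holding imax bit-tables of M bits each, and d_p is one
       contiguous integer workspace of M entries per index set; table[i] and data[i]
       below are simply pointers into these two slabs. */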

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can optimize this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space needed for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and their from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
    C     - MAT_MPIAIJ;
    imax  - total no of index sets processed at a time;
    table - an array of char - size = m bits.

   Output:
    isz  - array containing the count of the solution elements corresponding
           to each index set;
    data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
    C    - the matrix
    nrqr - number of messages being processed.
    rbuf - an array of pointers to the received requests

   Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

   For better efficiency perhaps we should malloc each xdata[i] separately;
   then if a realloc is required we need only copy the data for that one row
   rather than all previous rows, as happens now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
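  /* xdata[0] is a single buffer shared by all outgoing replies; if the estimate below
     turns out to be too small it is grown by a factor of 1.5 and the xdata[i] pointers
     are re-derived from isz1[] (see the reallocation blocks further down). */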
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}


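/*
   Usage sketch (illustrative only, not part of this file): the routine below is
   normally reached through the public interface, e.g.

      Mat *submat;
      ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   where each isrow[i]/iscol[i] pair describes one sequential submatrix to extract.
*/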
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);         /* initialize work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);         /* initialize work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);         /* initialize work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr);     /* initialize work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc-array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, and their from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate memory for outgoing messages */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can optimize this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }
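
  /* From here on the exchange proceeds in four rounds, each with its own tag:
     tag0 carries the row requests (sbuf1), tag1 the per-row column counts (sbuf2),
     tag2 the column indices (sbuf_aj) and tag3 the numerical values (sbuf_aa). */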

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */


  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  {
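    /* When PETSC_USE_CTABLE is defined a sparse hash table is used per index set
       (entries are stored shifted by +1 since a PetscTable cannot hold the key/value 0,
       and a lookup result of 0 means "not present"); otherwise a dense array of length
       C->cmap->N is used per index set. */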
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif
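  /*
     Note the asymmetry with the column map: the PETSC_USE_CTABLE row map also stores j+1
     (so the result of PetscTableFind() must be decremented), whereas the dense-array
     variant stores the local row j directly.
  */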
  /* Update lens from offproc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes the new rows are the same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initialize the matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }
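  /*
     The submatrices are filled in two passes: the lens[] counts gathered above give exact
     preallocation, and the loops below write column indices and values straight into the
     Mat_SeqAIJ i/j/a arrays (updating mat->ilen), bypassing MatSetValues().
  */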
  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k]; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows */
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        rmap_i = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  /* sort the rows */
  {
    PetscInt    *imat_ilen,*imat_j,*imat_i;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      imat_ilen = mat->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscInt ilen;

        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ilen  = imat_ilen[j];
        ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb, dcolemb for A and rowemb, ocolemb for B.
    The embeddings are supposed to be injections, which implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.

    This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
    Following this call, C->A & C->B have been created, even if empty.
 */
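/*
   Illustrative sketch (not compiled): how this routine is meant to be driven.  It mirrors the
   way MatGetSubMatricesMPI_MPIXAIJ() below uses it; Aseq, Bseq and the embedding index sets
   rowemb, dcolemb, ocolemb are hypothetical placeholders supplied by the caller.

     ierr = MatCreate(comm,&C);CHKERRQ(ierr);
     ierr = MatSetSizes(C,m,n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
     ierr = MatSetType(C,MATMPIAIJ);CHKERRQ(ierr);
     ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
     ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
     ierr = MatSetSeqMats_MPIAIJ(C,rowemb,dcolemb,ocolemb,DIFFERENT_NONZERO_PATTERN,Aseq,Bseq);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/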
#undef __FUNCT__
#define __FUNCT__ "MatSetSeqMats_MPIAIJ"
PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  Mat_SeqAIJ     *Baij;
  PetscBool      seqaij,Bdisassembled;
  PetscInt       m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
  PetscScalar    v;
  const PetscInt *rowindices,*colindices;

  PetscFunctionBegin;
  /* Check to make sure the component matrices (and embeddings) are compatible with C. */
  if (A) {
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
    } else {
      if (C->rmap->n != A->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (dcolemb) {
      ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
      if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
    } else {
      if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }
  if (B) {
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
    } else {
      if (C->rmap->n != B->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (ocolemb) {
      ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
      if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
    } else {
      if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }

  aij = (Mat_MPIAIJ*)(C->data);
  if (!aij->A) {
    /* Mimic parts of MatMPIAIJSetPreallocation() */
    ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
    ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
  }
  if (A) {
    ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
  } else {
    ierr = MatSetUp(aij->A);CHKERRQ(ierr);
  }
  if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
    /*
      If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
      need to "disassemble" B -- convert it to using C's global indices.
      To insert the values we take the safer, albeit more expensive, route of MatSetValues().

      If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
      we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.

      TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
      At least avoid calling MatSetValues() and the implied searches?
    */
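    /*
       "Disassemble" here means that aij->B temporarily uses *global* column indices
       (it is created with column size C->cmap->N below); assembling C later compresses
       them back into the compact, garray-based local numbering.
    */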

    if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr);
#else
      ierr = PetscFree(aij->colmap);CHKERRQ(ierr);
      /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */
      if (aij->B) {
        ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr);
      }
#endif
      ngcol = 0;
      if (aij->lvec) {
        ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr);
      }
      if (aij->garray) {
        ierr = PetscFree(aij->garray);CHKERRQ(ierr);
        ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr);
      }
      ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr);
      ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr);
    }
    if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
      ierr = MatDestroy(&aij->B);CHKERRQ(ierr);
    }
    if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
      ierr = MatZeroEntries(aij->B);CHKERRQ(ierr);
    }
  }
  Bdisassembled = PETSC_FALSE;
  if (!aij->B) {
    ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr);
    ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr);
    Bdisassembled = PETSC_TRUE;
  }
  if (B) {
    Baij = (Mat_SeqAIJ*)(B->data);
    if (pattern == DIFFERENT_NONZERO_PATTERN) {
      ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr);
      for (i=0; i<B->rmap->n; i++) {
        nz[i] = Baij->i[i+1] - Baij->i[i];
      }
      ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr);
      ierr = PetscFree(nz);CHKERRQ(ierr);
    }

    ierr  = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr);
    shift = rend-rstart;
    count = 0;
    rowindices = NULL;
    colindices = NULL;
    if (rowemb) {
      ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr);
    }
    if (ocolemb) {
      ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr);
    }
    for (i=0; i<B->rmap->n; i++) {
      PetscInt row;
      row = i;
      if (rowindices) row = rowindices[i];
      for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
        col = Baij->j[count];
        if (colindices) col = colindices[col];
        if (Bdisassembled && col>=rstart) col += shift;
        v    = Baij->a[count];
        ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr);
        ++count;
      }
    }
    /* No assembly for aij->B is necessary. */
    /* FIXME: set aij->B's nonzerostate correctly. */
  } else {
    ierr = MatSetUp(aij->B);CHKERRQ(ierr);
  }
  C->preallocated  = PETSC_TRUE;
  C->was_assembled = PETSC_FALSE;
  C->assembled     = PETSC_FALSE;
  /*
     C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
     Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
  */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSeqMats_MPIAIJ"
/*
  B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray.
*/
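/*
   Illustrative sketch (not compiled), assuming access to the Mat_MPIAIJ/Mat_SeqAIJ internals
   included at the top of this file: recovering the global column index of C for an entry of
   the off-diagonal block returned by this routine.  Ad, Bo, bo, r, k and gcol are hypothetical.

     Mat        Ad,Bo;
     Mat_SeqAIJ *bo;
     Mat_MPIAIJ *mpi = (Mat_MPIAIJ*)C->data;
     PetscInt   r,k,gcol;

     ierr = MatGetSeqMats_MPIAIJ(C,&Ad,&Bo);CHKERRQ(ierr);
     bo   = (Mat_SeqAIJ*)Bo->data;
     for (r=0; r<Bo->rmap->n; r++) {
       for (k=bo->i[r]; k<bo->i[r+1]; k++) {
         gcol = mpi->garray[bo->j[k]];
       }
     }
*/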
PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)(C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  /* FIXME: make sure C is assembled */
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
  Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                             PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                             PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                             PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                             PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar,ciscol_localsize;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize;
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     cismax is the number of parallel ISs on this rank; ispar is its maximum over C's comm.
     If ispar is zero -- that is, cismax is zero on all of C's ranks -- then and only then
     can we use purely sequential matrix extraction.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so we can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
             If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
       values into A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless the permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /* FIXME: the local column size of B[i] differs from the size of ciscol[i]:
     B[i]'s size is finally determined by MatAssembly, while ciscol[i] is computed
     as the complement of iscol[i].  It would be better to keep only the nonzero
     indices when computing the complement ciscol[i].
  */
  if (scall == MAT_REUSE_MATRIX) {
    for (i=0; i<cismax; i++) {
      ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
      B[i]->cmap->n = ciscol_localsize;
    }
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if isize == 1).
     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
         - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
         - if caching arrays to hold the ISs, make and compose a container for them so that it can
           be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        BB = NULL;
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
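
/*
   Illustrative usage sketch (not compiled): extracting overlapping submatrices of an MPIAIJ
   matrix the way a domain-decomposition preconditioner might.  It assumes the caller created
   the index sets is[0] and is[1]; MatIncreaseOverlap() and MatGetSubMatrices() are the public
   entry points that dispatch to the routines in this file for MATMPIAIJ.

     Mat C,*submats;
     IS  is[2];

     ierr = MatIncreaseOverlap(C,2,is,1);CHKERRQ(ierr);
     ierr = MatGetSubMatrices(C,2,is,is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
     ierr = MatDestroyMatrices(2,&submats);CHKERRQ(ierr);
*/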