1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 #include <../src/mat/impls/aij/mpi/mpiaij.h> 8 #include <petscbt.h> 9 #include <petscsf.h> 10 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 16 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 20 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 21 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 25 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 26 { 27 PetscErrorCode ierr; 28 PetscInt i; 29 30 PetscFunctionBegin; 31 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 32 for (i=0; i<ov; ++i) { 33 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 34 } 35 PetscFunctionReturn(0); 36 } 37 38 #undef __FUNCT__ 39 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable" 40 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 41 { 42 PetscErrorCode ierr; 43 PetscInt i; 44 45 PetscFunctionBegin; 46 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap 
specified"); 47 for (i=0; i<ov; ++i) { 48 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 49 } 50 PetscFunctionReturn(0); 51 } 52 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable" 56 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 57 { 58 PetscErrorCode ierr; 59 MPI_Comm comm; 60 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 61 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 62 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 63 const PetscInt *indices_i,**indices; 64 PetscLayout rmap; 65 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 66 PetscSF sf; 67 PetscSFNode *remote; 68 69 PetscFunctionBegin; 70 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 71 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 72 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 73 /* get row map to determine where rows should be going */ 74 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 75 /* retrieve IS data and put all together so that we 76 * can optimize communication 77 * */ 78 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 79 for (i=0,tlength=0; i<nidx; i++){ 80 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 81 tlength += length[i]; 82 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 83 } 84 /* find these rows on remote processors */ 85 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 86 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 87 nrrows = 0; 88 for (i=0; i<nidx; i++){ 89 length_i = length[i]; 90 indices_i = indices[i]; 91 for (j=0; j<length_i; j++){ 92 owner = -1; 93 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 94 /* remote processors */ 95 if (owner != rank){ 96 tosizes_temp[owner]++; /* number of rows to owner */ 97 
rrow_ranks[nrrows] = owner; /* processor */ 98 rrow_isids[nrrows] = i; /* is id */ 99 remoterows[nrrows++] = indices_i[j]; /* row */ 100 } 101 } 102 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 103 } 104 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 105 /* test if we need to exchange messages 106 * generally speaking, we do not need to exchange 107 * data when overlap is 1 108 * */ 109 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 110 /* we do not have any messages 111 * It usually corresponds to overlap 1 112 * */ 113 if (!reducednrrows){ 114 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 115 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 116 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 117 PetscFunctionReturn(0); 118 } 119 nto = 0; 120 /* send sizes and ranks for building a two-sided communcation */ 121 for (i=0; i<size; i++){ 122 if (tosizes_temp[i]){ 123 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 124 tosizes_temp[i] = nto; /* a map from processor to index */ 125 toranks[nto++] = i; /* processor */ 126 } 127 } 128 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 129 for (i=0; i<nto; i++){ 130 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 131 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 132 } 133 /* send information to other processors */ 134 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 135 nrecvrows = 0; 136 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 137 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 138 nrecvrows = 0; 139 for (i=0; i<nfrom; i++){ 140 for (j=0; j<fromsizes[2*i]; j++){ 141 remote[nrecvrows].rank = fromranks[i]; 142 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 143 } 144 } 145 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 146 ierr = 
PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 147 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 148 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 149 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 150 /* message pair <no of is, row> */ 151 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 152 for (i=0; i<nrrows; i++){ 153 owner = rrow_ranks[i]; /* processor */ 154 j = tosizes_temp[owner]; /* index */ 155 todata[toffsets[j]++] = rrow_isids[i]; 156 todata[toffsets[j]++] = remoterows[i]; 157 } 158 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 159 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 160 ierr = PetscFree(toffsets);CHKERRQ(ierr); 161 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 162 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 163 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 164 /* send rows belonging to the remote so that then we could get the overlapping data back */ 165 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 166 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 167 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 168 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 169 ierr = PetscFree(fromranks);CHKERRQ(ierr); 170 nrecvrows = 0; 171 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 172 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 173 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 174 nrecvrows = 0; 175 for (i=0; i<nto; i++){ 176 for (j=0; j<tosizes[2*i]; j++){ 177 remote[nrecvrows].rank = toranks[i]; 178 remote[nrecvrows++].index = tosizes[2*i+1]+j; 179 } 180 } 181 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 182 ierr = 
PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 183 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 184 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 185 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 186 /* overlap communication and computation */ 187 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 188 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 189 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 190 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 191 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 192 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 193 ierr = PetscFree(toranks);CHKERRQ(ierr); 194 ierr = PetscFree(tosizes);CHKERRQ(ierr); 195 ierr = PetscFree(todata);CHKERRQ(ierr); 196 PetscFunctionReturn(0); 197 } 198 199 #undef __FUNCT__ 200 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable" 201 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 202 { 203 PetscInt *isz,isz_i,i,j,is_id, data_size; 204 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 205 const PetscInt *indices_i_temp; 206 PetscErrorCode ierr; 207 208 PetscFunctionBegin; 209 max_lsize = 0; 210 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 211 for (i=0; i<nidx; i++){ 212 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 213 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 214 isz[i] = lsize; 215 } 216 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 217 for (i=0; i<nidx; i++){ 218 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 219 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 220 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 221 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 222 } 223 /* retrieve information to get row id and its overlap */ 224 for (i=0; i<nrecvs; ){ 225 is_id = recvdata[i++]; 226 data_size = recvdata[i++]; 227 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 228 isz_i = isz[is_id]; 229 for (j=0; j< data_size; j++){ 230 col = recvdata[i++]; 231 indices_i[isz_i++] = col; 232 } 233 isz[is_id] = isz_i; 234 } 235 /* remove duplicate entities */ 236 for (i=0; i<nidx; i++){ 237 indices_i = indices_temp+(max_lsize+nrecvs)*i; 238 isz_i = isz[i]; 239 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 240 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 241 } 242 ierr = PetscFree(isz);CHKERRQ(ierr); 243 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 244 PetscFunctionReturn(0); 245 } 246 247 #undef __FUNCT__ 248 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable" 249 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 250 { 251 PetscLayout rmap,cmap; 252 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 253 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 254 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 255 const PetscInt *gcols,*ai,*aj,*bi,*bj; 256 Mat amat,bmat; 257 PetscMPIInt rank; 258 PetscBool done; 259 MPI_Comm comm; 260 PetscErrorCode ierr; 261 262 PetscFunctionBegin; 263 ierr = 
PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 264 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 265 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 266 /* Even if the mat is symmetric, we still assume it is not symmetric */ 267 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 268 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 269 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 270 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 271 /* total number of nonzero values is used to estimate the memory usage in the next step */ 272 tnz = ai[an]+bi[bn]; 273 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 274 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 275 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 276 /* to find the longest message */ 277 max_fszs = 0; 278 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 279 /* better way to estimate number of nonzero in the mat??? 
*/ 280 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 281 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 282 rows_pos = 0; 283 totalrows = 0; 284 for (i=0; i<nfrom; i++){ 285 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 286 /* group data together */ 287 for (j=0; j<fromsizes[2*i]; j+=2){ 288 is_id = fromrows[rows_pos++];/* no of is */ 289 rows_i = rows_data[is_id]; 290 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 291 } 292 /* estimate a space to avoid multiple allocations */ 293 for (j=0; j<nidx; j++){ 294 indvc_ij = 0; 295 rows_i = rows_data[j]; 296 for (l=0; l<rows_pos_i[j]; l++){ 297 row = rows_i[l]-rstart; 298 start = ai[row]; 299 end = ai[row+1]; 300 for (k=start; k<end; k++){ /* Amat */ 301 col = aj[k] + cstart; 302 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 303 } 304 start = bi[row]; 305 end = bi[row+1]; 306 for (k=start; k<end; k++) { /* Bmat */ 307 col = gcols[bj[k]]; 308 indices_tmp[indvc_ij++] = col; 309 } 310 } 311 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 312 indv_counts[i*nidx+j] = indvc_ij; 313 totalrows += indvc_ij; 314 } 315 } 316 /* message triple <no of is, number of rows, rows> */ 317 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 318 totalrows = 0; 319 rows_pos = 0; 320 /* use this code again */ 321 for (i=0;i<nfrom;i++){ 322 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 323 for (j=0; j<fromsizes[2*i]; j+=2){ 324 is_id = fromrows[rows_pos++]; 325 rows_i = rows_data[is_id]; 326 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 327 } 328 /* add data */ 329 for (j=0; j<nidx; j++){ 330 if (!indv_counts[i*nidx+j]) continue; 331 indvc_ij = 0; 332 sbdata[totalrows++] = j; 333 sbdata[totalrows++] = indv_counts[i*nidx+j]; 334 sbsizes[2*i] += 2; 335 rows_i = rows_data[j]; 336 for 
(l=0; l<rows_pos_i[j]; l++){ 337 row = rows_i[l]-rstart; 338 start = ai[row]; 339 end = ai[row+1]; 340 for (k=start; k<end; k++){ /* Amat */ 341 col = aj[k] + cstart; 342 indices_tmp[indvc_ij++] = col; 343 } 344 start = bi[row]; 345 end = bi[row+1]; 346 for (k=start; k<end; k++) { /* Bmat */ 347 col = gcols[bj[k]]; 348 indices_tmp[indvc_ij++] = col; 349 } 350 } 351 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 352 sbsizes[2*i] += indvc_ij; 353 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 354 totalrows += indvc_ij; 355 } 356 } 357 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 358 for (i=0; i<nfrom; i++){ 359 offsets[i+1] = offsets[i] + sbsizes[2*i]; 360 sbsizes[2*i+1] = offsets[i]; 361 } 362 ierr = PetscFree(offsets);CHKERRQ(ierr); 363 if (sbrowsizes) *sbrowsizes = sbsizes; 364 if (sbrows) *sbrows = sbdata; 365 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 366 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 367 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 368 PetscFunctionReturn(0); 369 } 370 371 #undef __FUNCT__ 372 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 373 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 374 { 375 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 376 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 377 PetscInt lsize,lsize_tmp,owner; 378 PetscMPIInt rank; 379 Mat amat,bmat; 380 PetscBool done; 381 PetscLayout cmap,rmap; 382 MPI_Comm comm; 383 PetscErrorCode ierr; 384 385 PetscFunctionBegin; 386 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 387 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 388 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 389 ierr = 
MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 390 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 391 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 392 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 393 /* is it a safe way to compute number of nonzero values ? */ 394 tnz = ai[an]+bi[bn]; 395 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 396 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 397 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 398 /* it is a better way to estimate memory than the old implementation 399 * where global size of matrix is used 400 * */ 401 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 402 for (i=0; i<nidx; i++) { 403 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 404 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 405 lsize_tmp = 0; 406 for (j=0; j<lsize; j++) { 407 owner = -1; 408 row = indices[j]; 409 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 410 if (owner != rank) continue; 411 /* local number */ 412 row -= rstart; 413 start = ai[row]; 414 end = ai[row+1]; 415 for (k=start; k<end; k++) { /* Amat */ 416 col = aj[k] + cstart; 417 indices_temp[lsize_tmp++] = col; 418 } 419 start = bi[row]; 420 end = bi[row+1]; 421 for (k=start; k<end; k++) { /* Bmat */ 422 col = gcols[bj[k]]; 423 indices_temp[lsize_tmp++] = col; 424 } 425 } 426 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 427 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 428 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 429 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 430 } 431 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 432 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 433 ierr = 
MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 434 PetscFunctionReturn(0); 435 } 436 437 438 /* 439 Sample message format: 440 If a processor A wants processor B to process some elements corresponding 441 to index sets is[1],is[5] 442 mesg [0] = 2 (no of index sets in the mesg) 443 ----------- 444 mesg [1] = 1 => is[1] 445 mesg [2] = sizeof(is[1]); 446 ----------- 447 mesg [3] = 5 => is[5] 448 mesg [4] = sizeof(is[5]); 449 ----------- 450 mesg [5] 451 mesg [n] datas[1] 452 ----------- 453 mesg[n+1] 454 mesg[m] data(is[5]) 455 ----------- 456 457 Notes: 458 nrqs - no of requests sent (or to be sent out) 459 nrqr - no of requests recieved (which have to be or which have been processed 460 */ 461 #undef __FUNCT__ 462 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 463 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 464 { 465 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 466 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 467 const PetscInt **idx,*idx_i; 468 PetscInt *n,**data,len; 469 PetscErrorCode ierr; 470 PetscMPIInt size,rank,tag1,tag2; 471 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 472 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 473 PetscBT *table; 474 MPI_Comm comm; 475 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 476 MPI_Status *s_status,*recv_status; 477 char *t_p; 478 479 PetscFunctionBegin; 480 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 481 size = c->size; 482 rank = c->rank; 483 M = C->rmap->N; 484 485 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 486 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 487 488 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 489 490 for (i=0; i<imax; i++) { 491 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 492 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 493 } 494 495 /* evaluate communication - mesg to who,length of mesg, and 
buffer space 496 required. Based on this, buffers are allocated, and data copied into them */ 497 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 498 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 499 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 500 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 501 for (i=0; i<imax; i++) { 502 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 503 idx_i = idx[i]; 504 len = n[i]; 505 for (j=0; j<len; j++) { 506 row = idx_i[j]; 507 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 508 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 509 w4[proc]++; 510 } 511 for (j=0; j<size; j++) { 512 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 513 } 514 } 515 516 nrqs = 0; /* no of outgoing messages */ 517 msz = 0; /* total mesg length (for all proc */ 518 w1[rank] = 0; /* no mesg sent to intself */ 519 w3[rank] = 0; 520 for (i=0; i<size; i++) { 521 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 522 } 523 /* pa - is list of processors to communicate with */ 524 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 525 for (i=0,j=0; i<size; i++) { 526 if (w1[i]) {pa[j] = i; j++;} 527 } 528 529 /* Each message would have a header = 1 + 2*(no of IS) + data */ 530 for (i=0; i<nrqs; i++) { 531 j = pa[i]; 532 w1[j] += w2[j] + 2*w3[j]; 533 msz += w1[j]; 534 } 535 536 /* Determine the number of messages to expect, their lengths, from from-ids */ 537 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 538 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 539 540 /* Now post the Irecvs corresponding to these messages */ 541 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 542 543 /* 
Allocate Memory for outgoing messages */ 544 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 545 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 546 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 547 548 { 549 PetscInt *iptr = tmp,ict = 0; 550 for (i=0; i<nrqs; i++) { 551 j = pa[i]; 552 iptr += ict; 553 outdat[j] = iptr; 554 ict = w1[j]; 555 } 556 } 557 558 /* Form the outgoing messages */ 559 /* plug in the headers */ 560 for (i=0; i<nrqs; i++) { 561 j = pa[i]; 562 outdat[j][0] = 0; 563 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 564 ptr[j] = outdat[j] + 2*w3[j] + 1; 565 } 566 567 /* Memory for doing local proc's work */ 568 { 569 PetscInt Mimax = 0,M_BPB_imax = 0; 570 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 571 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 572 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 573 for (i=0; i<imax; i++) { 574 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 575 data[i] = d_p + M*i; 576 } 577 } 578 579 /* Parse the IS and update local tables and the outgoing buf with the data */ 580 { 581 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 582 PetscBT table_i; 583 584 for (i=0; i<imax; i++) { 585 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 586 n_i = n[i]; 587 table_i = table[i]; 588 idx_i = idx[i]; 589 data_i = data[i]; 590 isz_i = isz[i]; 591 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 592 row = idx_i[j]; 593 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 594 if (proc != rank) { /* copy to the outgoing buffer */ 595 ctr[proc]++; 596 *ptr[proc] = row; 597 ptr[proc]++; 598 } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */ 599 } 600 /* Update the headers for the current IS */ 601 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 602 if 
((ctr_j = ctr[j])) { 603 outdat_j = outdat[j]; 604 k = ++outdat_j[0]; 605 outdat_j[2*k] = ctr_j; 606 outdat_j[2*k-1] = i; 607 } 608 } 609 isz[i] = isz_i; 610 } 611 } 612 613 /* Now post the sends */ 614 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 615 for (i=0; i<nrqs; ++i) { 616 j = pa[i]; 617 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 618 } 619 620 /* No longer need the original indices */ 621 for (i=0; i<imax; ++i) { 622 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 623 } 624 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 625 626 for (i=0; i<imax; ++i) { 627 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 628 } 629 630 /* Do Local work */ 631 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 632 633 /* Receive messages */ 634 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 635 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 636 637 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 638 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 639 640 /* Phase 1 sends are complete - deallocate buffers */ 641 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 642 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 643 644 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 645 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 646 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 647 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 648 ierr = PetscFree(rbuf);CHKERRQ(ierr); 649 650 651 /* Send the data back */ 652 /* Do a global reduction to know the buffer space req for incoming messages */ 653 { 654 PetscMPIInt *rw1; 655 656 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 657 658 for (i=0; i<nrqr; ++i) { 659 proc = recv_status[i].MPI_SOURCE; 660 661 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 662 rw1[proc] = isz1[i]; 663 } 664 ierr = PetscFree(onodes1);CHKERRQ(ierr); 665 ierr = 
PetscFree(olengths1);CHKERRQ(ierr); 666 667 /* Determine the number of messages to expect, their lengths, from from-ids */ 668 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 669 ierr = PetscFree(rw1);CHKERRQ(ierr); 670 } 671 /* Now post the Irecvs corresponding to these messages */ 672 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 673 674 /* Now post the sends */ 675 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 676 for (i=0; i<nrqr; ++i) { 677 j = recv_status[i].MPI_SOURCE; 678 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 679 } 680 681 /* receive work done on other processors */ 682 { 683 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 684 PetscMPIInt idex; 685 PetscBT table_i; 686 MPI_Status *status2; 687 688 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 689 for (i=0; i<nrqs; ++i) { 690 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 691 /* Process the message */ 692 rbuf2_i = rbuf2[idex]; 693 ct1 = 2*rbuf2_i[0]+1; 694 jmax = rbuf2[idex][0]; 695 for (j=1; j<=jmax; j++) { 696 max = rbuf2_i[2*j]; 697 is_no = rbuf2_i[2*j-1]; 698 isz_i = isz[is_no]; 699 data_i = data[is_no]; 700 table_i = table[is_no]; 701 for (k=0; k<max; k++,ct1++) { 702 row = rbuf2_i[ct1]; 703 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 704 } 705 isz[is_no] = isz_i; 706 } 707 } 708 709 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 710 ierr = PetscFree(status2);CHKERRQ(ierr); 711 } 712 713 for (i=0; i<imax; ++i) { 714 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 715 } 716 717 ierr = PetscFree(onodes2);CHKERRQ(ierr); 718 ierr = PetscFree(olengths2);CHKERRQ(ierr); 719 720 ierr = PetscFree(pa);CHKERRQ(ierr); 721 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 722 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 723 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 724 
ierr = PetscFree(r_waits1);CHKERRQ(ierr); 725 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 726 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 727 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 728 ierr = PetscFree(s_status);CHKERRQ(ierr); 729 ierr = PetscFree(recv_status);CHKERRQ(ierr); 730 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 731 ierr = PetscFree(xdata);CHKERRQ(ierr); 732 ierr = PetscFree(isz1);CHKERRQ(ierr); 733 PetscFunctionReturn(0); 734 } 735 736 #undef __FUNCT__ 737 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 738 /* 739 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 740 the work on the local processor. 741 742 Inputs: 743 C - MAT_MPIAIJ; 744 imax - total no of index sets processed at a time; 745 table - an array of char - size = m bits. 746 747 Output: 748 isz - array containing the count of the solution elements corresponding 749 to each index set; 750 data - pointer to the solutions 751 */ 752 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 753 { 754 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 755 Mat A = c->A,B = c->B; 756 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 757 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 758 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 759 PetscBT table_i; 760 761 PetscFunctionBegin; 762 rstart = C->rmap->rstart; 763 cstart = C->cmap->rstart; 764 ai = a->i; 765 aj = a->j; 766 bi = b->i; 767 bj = b->j; 768 garray = c->garray; 769 770 771 for (i=0; i<imax; i++) { 772 data_i = data[i]; 773 table_i = table[i]; 774 isz_i = isz[i]; 775 for (j=0,max=isz[i]; j<max; j++) { 776 row = data_i[j] - rstart; 777 start = ai[row]; 778 end = ai[row+1]; 779 for (k=start; k<end; k++) { /* Amat */ 780 val = aj[k] + cstart; 781 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 782 } 783 start = bi[row]; 784 end = bi[row+1]; 785 for (k=start; k<end; k++) { /* Bmat */ 786 val = garray[bj[k]]; 787 if 
(!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 788 } 789 } 790 isz[i] = isz_i; 791 } 792 PetscFunctionReturn(0); 793 } 794 795 #undef __FUNCT__ 796 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 797 /* 798 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 799 and return the output 800 801 Input: 802 C - the matrix 803 nrqr - no of messages being processed. 804 rbuf - an array of pointers to the recieved requests 805 806 Output: 807 xdata - array of messages to be sent back 808 isz1 - size of each message 809 810 For better efficiency perhaps we should malloc separately each xdata[i], 811 then if a remalloc is required we need only copy the data for that one row 812 rather then all previous rows as it is now where a single large chunck of 813 memory is used. 814 815 */ 816 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 817 { 818 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 819 Mat A = c->A,B = c->B; 820 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 821 PetscErrorCode ierr; 822 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 823 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 824 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 825 PetscInt *rbuf_i,kmax,rbuf_0; 826 PetscBT xtable; 827 828 PetscFunctionBegin; 829 m = C->rmap->N; 830 rstart = C->rmap->rstart; 831 cstart = C->cmap->rstart; 832 ai = a->i; 833 aj = a->j; 834 bi = b->i; 835 bj = b->j; 836 garray = c->garray; 837 838 839 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 840 rbuf_i = rbuf[i]; 841 rbuf_0 = rbuf_i[0]; 842 ct += rbuf_0; 843 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 844 } 845 846 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 847 else max1 = 1; 848 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); /* completes: mem_estimate = 3*((total_sz > max1 ? ...) */
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];      /* number of index sets in this message */
    ct1    = 2*rbuf_0+1;     /* read cursor: skip the header of the request */
    ct2    = ct1;            /* write cursor into xdata[i] */
    ct3   += ct1;            /* running count of words written over all messages */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;            /* remember where this IS's rows begin */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          /* all xdata[] pointers alias one shared buffer: on growth, copy it
             and re-base every earlier message pointer */
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each accepted row by the columns of its A and B nonzeros */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/* Gather the entire parallel matrix A as a sequential matrix on every process.
   flag selects whether numerical values are gathered (MAT_GET_VALUES) or only
   the structure; scall selects initial creation vs reuse of *Bin. */
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); /* completes: ierr = ... */

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = nonzeros in diagonal block plus off-diagonal block */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write straight into B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same traversal order as the column-index pass above, so values and
       indices stay aligned */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* the MAT_REUSE_MATRIX path did not allocate these above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr =
MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); /* completes: ierr = ... */
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry flags of A to the gathered sequential matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/* Extract ismax sequential submatrices C(isrow[i],iscol[i]). The work is
   delegated to MatGetSubMatrices_MPIAIJ_Local in stages to bound the memory
   used per stage. */
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* MPI_MIN: every rank must agree before taking the fast all-matrix path */
  ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0; /* this rank is done but must still participate */
    else max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/* Worker for MatGetSubMatrices_MPIAIJ: extracts up to ismax submatrices,
   fetching off-process rows via point-to-point messages. */
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b =
(Mat_SeqAIJ*)c->B->data,*mat; /* completes the Mat_SeqAIJ declaration above */
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      /* whole column space requested: no explicit column index list is kept */
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      /* locate the owner of global row irow_i[j] by scanning the row ranges */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* partition the single tmp buffer among the destination processes */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1; /* data region starts past the header */
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz; /* reuse the tail of tmp as the receive area */
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      /* reply with the nonzero count of each requested row */
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr =
PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); /* completes: ierr = ... */
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    /* rbuf2[idex][0] carries the reply size; post recvs for the column
       indices (rbuf3) and values (rbuf4) */
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols = sbuf_aj_i + ct2;

          /* emit global columns in ascending order: B-part before the
             diagonal block, the A (diagonal) part, then B-part after it */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          /* values are emitted in the same order as the indices sent above */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          /* stored shifted by one so that 0 can mean "column not selected" */
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1; /* 1-based; 0 means column not selected */
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate...
*/
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    /* one contiguous allocation; lens[i] points at its slice */
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) { /* only locally owned rows are counted here */
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1; /* back to 0-based local column */
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k]; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        rmap_i = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
for (l=0; l<max2; l++,ct2++) { 1836 1837 #if defined(PETSC_USE_CTABLE) 1838 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1839 #else 1840 tcol = cmap_i[rbuf3_i[ct2]]; 1841 #endif 1842 if (tcol) { 1843 *mat_j++ = tcol - 1; 1844 *mat_a++ = rbuf4_i[ct2]; 1845 ilen++; 1846 } 1847 } 1848 } else { /* allcolumns */ 1849 for (l=0; l<max2; l++,ct2++) { 1850 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1851 *mat_a++ = rbuf4_i[ct2]; 1852 ilen++; 1853 } 1854 } 1855 imat_ilen[row] = ilen; 1856 } 1857 } 1858 } 1859 } 1860 1861 /* sort the rows */ 1862 { 1863 PetscInt *imat_ilen,*imat_j,*imat_i; 1864 PetscScalar *imat_a; 1865 1866 for (i=0; i<ismax; i++) { 1867 mat = (Mat_SeqAIJ*)submats[i]->data; 1868 imat_j = mat->j; 1869 imat_i = mat->i; 1870 imat_a = mat->a; 1871 imat_ilen = mat->ilen; 1872 1873 if (allcolumns[i]) continue; 1874 jmax = nrow[i]; 1875 for (j=0; j<jmax; j++) { 1876 PetscInt ilen; 1877 1878 mat_i = imat_i[j]; 1879 mat_a = imat_a + mat_i; 1880 mat_j = imat_j + mat_i; 1881 ilen = imat_ilen[j]; 1882 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 1883 } 1884 } 1885 } 1886 1887 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1888 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1889 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1890 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1891 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1892 1893 /* Restore the indices */ 1894 for (i=0; i<ismax; i++) { 1895 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1896 if (!allcolumns[i]) { 1897 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1898 } 1899 } 1900 1901 /* Destroy allocated memory */ 1902 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1903 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1904 ierr = PetscFree(pa);CHKERRQ(ierr); 1905 1906 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1907 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1908 for (i=0; i<nrqr; ++i) { 1909 ierr = 
PetscFree(sbuf2[i]);CHKERRQ(ierr); 1910 } 1911 for (i=0; i<nrqs; ++i) { 1912 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1913 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1914 } 1915 1916 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1917 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1918 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1919 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1920 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1921 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1922 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1923 1924 #if defined(PETSC_USE_CTABLE) 1925 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1926 #else 1927 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1928 #endif 1929 ierr = PetscFree(rmap);CHKERRQ(ierr); 1930 1931 for (i=0; i<ismax; i++) { 1932 if (!allcolumns[i]) { 1933 #if defined(PETSC_USE_CTABLE) 1934 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1935 #else 1936 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1937 #endif 1938 } 1939 } 1940 ierr = PetscFree(cmap);CHKERRQ(ierr); 1941 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1942 ierr = PetscFree(lens);CHKERRQ(ierr); 1943 1944 for (i=0; i<ismax; i++) { 1945 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1946 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1947 } 1948 PetscFunctionReturn(0); 1949 } 1950 1951 /* 1952 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 1953 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 1954 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 1955 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 1956 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 1957 state, and needs to be "assembled" later by compressing B's column space. 
1958 1959 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 1960 Following this call, C->A & C->B have been created, even if empty. 1961 */ 1962 #undef __FUNCT__ 1963 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 1964 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 1965 { 1966 /* If making this function public, change the error returned in this function away from _PLIB. */ 1967 PetscErrorCode ierr; 1968 Mat_MPIAIJ *aij; 1969 Mat_SeqAIJ *Baij; 1970 PetscBool seqaij,Bdisassembled; 1971 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 1972 PetscScalar v; 1973 const PetscInt *rowindices,*colindices; 1974 1975 PetscFunctionBegin; 1976 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 1977 if (A) { 1978 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 1979 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 1980 if (rowemb) { 1981 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 1982 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 1983 } else { 1984 if (C->rmap->n != A->rmap->n) { 1985 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 1986 } 1987 } 1988 if (dcolemb) { 1989 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 1990 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 1991 } else { 1992 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 1993 } 1994 } 1995 if (B) { 1996 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 1997 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong 
type"); 1998 if (rowemb) { 1999 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2000 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2001 } else { 2002 if (C->rmap->n != B->rmap->n) { 2003 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2004 } 2005 } 2006 if (ocolemb) { 2007 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2008 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2009 } else { 2010 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2011 } 2012 } 2013 2014 aij = (Mat_MPIAIJ*)(C->data); 2015 if (!aij->A) { 2016 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2017 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2018 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2019 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2020 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2021 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2022 } 2023 if (A) { 2024 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2025 } else { 2026 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2027 } 2028 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2029 /* 2030 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2031 need to "disassemble" B -- convert it to using C's global indices. 2032 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2033 2034 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2035 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2036 2037 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2038 At least avoid calling MatSetValues() and the implied searches? 2039 */ 2040 2041 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2042 #if defined(PETSC_USE_CTABLE) 2043 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2044 #else 2045 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2046 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2047 if (aij->B) { 2048 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2049 } 2050 #endif 2051 ngcol = 0; 2052 if (aij->lvec) { 2053 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2054 } 2055 if (aij->garray) { 2056 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2057 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2058 } 2059 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2060 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2061 } 2062 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2063 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2064 } 2065 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2066 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2067 } 2068 } 2069 Bdisassembled = PETSC_FALSE; 2070 if (!aij->B) { 2071 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2072 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2073 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2074 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2075 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2076 Bdisassembled = PETSC_TRUE; 2077 } 2078 if (B) { 2079 Baij = (Mat_SeqAIJ*)(B->data); 2080 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2081 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2082 for (i=0; i<B->rmap->n; i++) { 2083 nz[i] = Baij->i[i+1] - Baij->i[i]; 2084 } 2085 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
2086 ierr = PetscFree(nz);CHKERRQ(ierr); 2087 } 2088 2089 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2090 shift = rend-rstart; 2091 count = 0; 2092 rowindices = NULL; 2093 colindices = NULL; 2094 if (rowemb) { 2095 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 2096 } 2097 if (ocolemb) { 2098 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 2099 } 2100 for (i=0; i<B->rmap->n; i++) { 2101 PetscInt row; 2102 row = i; 2103 if (rowindices) row = rowindices[i]; 2104 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 2105 col = Baij->j[count]; 2106 if (colindices) col = colindices[col]; 2107 if (Bdisassembled && col>=rstart) col += shift; 2108 v = Baij->a[count]; 2109 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 2110 ++count; 2111 } 2112 } 2113 /* No assembly for aij->B is necessary. */ 2114 /* FIXME: set aij->B's nonzerostate correctly. */ 2115 } else { 2116 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 2117 } 2118 C->preallocated = PETSC_TRUE; 2119 C->was_assembled = PETSC_FALSE; 2120 C->assembled = PETSC_FALSE; 2121 /* 2122 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2123 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2124 */ 2125 PetscFunctionReturn(0); 2126 } 2127 2128 #undef __FUNCT__ 2129 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 2130 /* 2131 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2132 */ 2133 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 2134 { 2135 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 2136 2137 PetscFunctionBegin; 2138 PetscValidPointer(A,2); 2139 PetscValidPointer(B,3); 2140 /* FIXME: make sure C is assembled */ 2141 *A = aij->A; 2142 *B = aij->B; 2143 /* Note that we don't incref *A and *B, so be careful! 
*/ 2144 PetscFunctionReturn(0); 2145 } 2146 2147 /* 2148 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 2149 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 2150 */ 2151 #undef __FUNCT__ 2152 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 2153 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 2154 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 2155 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 2156 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 2157 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 2158 { 2159 PetscErrorCode ierr; 2160 PetscMPIInt isize,flag; 2161 PetscInt i,ii,cismax,ispar,ciscol_localsize; 2162 Mat *A,*B; 2163 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 2164 2165 PetscFunctionBegin; 2166 if (!ismax) PetscFunctionReturn(0); 2167 2168 for (i = 0, cismax = 0; i < ismax; ++i) { 2169 PetscMPIInt isize; 2170 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 2171 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 2172 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 2173 if (isize > 1) ++cismax; 2174 } 2175 /* 2176 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 2177 ispar counts the number of parallel ISs across C's comm. 2178 */ 2179 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2180 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. 
*/ 2181 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2182 PetscFunctionReturn(0); 2183 } 2184 2185 /* if (ispar) */ 2186 /* 2187 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 2188 These are used to extract the off-diag portion of the resulting parallel matrix. 2189 The row IS for the off-diag portion is the same as for the diag portion, 2190 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 2191 */ 2192 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 2193 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 2194 for (i = 0, ii = 0; i < ismax; ++i) { 2195 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2196 if (isize > 1) { 2197 /* 2198 TODO: This is the part that's ***NOT SCALABLE***. 2199 To fix this we need to extract just the indices of C's nonzero columns 2200 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 2201 part of iscol[i] -- without actually computing ciscol[ii]. This also has 2202 to be done without serializing on the IS list, so, most likely, it is best 2203 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 2204 */ 2205 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 2206 /* Now we have to 2207 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 2208 were sorted on each rank, concatenated they might no longer be sorted; 2209 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 2210 indices in the nondecreasing order to the original index positions. 2211 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 
2212 */ 2213 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 2214 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 2215 ++ii; 2216 } 2217 } 2218 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 2219 for (i = 0, ii = 0; i < ismax; ++i) { 2220 PetscInt j,issize; 2221 const PetscInt *indices; 2222 2223 /* 2224 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 2225 */ 2226 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 2227 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 2228 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 2229 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 2230 for (j = 1; j < issize; ++j) { 2231 if (indices[j] == indices[j-1]) { 2232 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 2233 } 2234 } 2235 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 2236 2237 2238 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 2239 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 2240 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 2241 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 2242 for (j = 1; j < issize; ++j) { 2243 if (indices[j-1] == indices[j]) { 2244 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 2245 } 2246 } 2247 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 2248 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2249 if (isize > 1) { 2250 cisrow[ii] = isrow[i]; 2251 ++ii; 2252 } 2253 } 2254 /* 2255 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 2256 array of sequential matrices underlying the resulting parallel matrices. 2257 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 2258 contain duplicates. 
2259 2260 There are as many diag matrices as there are original index sets. There are only as many parallel 2261 and off-diag matrices, as there are parallel (comm size > 1) index sets. 2262 2263 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 2264 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 2265 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 2266 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 2267 with A[i] and B[ii] extracted from the corresponding MPI submat. 2268 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 2269 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 2270 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 2271 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 2272 values into A[i] and B[ii] sitting inside the corresponding submat. 2273 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 2274 A[i], B[ii], AA[i] or BB[ii] matrices. 2275 */ 2276 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 2277 if (scall == MAT_INITIAL_MATRIX) { 2278 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 2279 /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */ 2280 } else { 2281 ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr); 2282 ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr); 2283 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. 
*/ 2284 for (i = 0, ii = 0; i < ismax; ++i) { 2285 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2286 if (isize > 1) { 2287 Mat AA,BB; 2288 ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr); 2289 if (!isrow_p[i] && !iscol_p[i]) { 2290 A[i] = AA; 2291 } else { 2292 /* TODO: extract A[i] composed on AA. */ 2293 ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 2294 } 2295 if (!isrow_p[i] && !ciscol_p[ii]) { 2296 B[ii] = BB; 2297 } else { 2298 /* TODO: extract B[ii] composed on BB. */ 2299 ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr); 2300 } 2301 ++ii; 2302 } else { 2303 if (!isrow_p[i] && !iscol_p[i]) { 2304 A[i] = (*submat)[i]; 2305 } else { 2306 /* TODO: extract A[i] composed on (*submat)[i]. */ 2307 ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 2308 } 2309 } 2310 } 2311 } 2312 /* Now obtain the sequential A and B submatrices separately. */ 2313 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr); 2314 /* I did not figure out a good way to fix it right now. 2315 * Local column size of B[i] is different from the size of ciscol[i]. 2316 * B[i]'s size is finally determined by MatAssembly, while 2317 * ciscol[i] is computed as the complement of iscol[i]. 2318 * It is better to keep only nonzero indices when computing 2319 * the complement ciscol[i]. 2320 * */ 2321 if(scall==MAT_REUSE_MATRIX){ 2322 for(i=0; i<cismax; i++){ 2323 ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr); 2324 B[i]->cmap->n = ciscol_localsize; 2325 } 2326 } 2327 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr); 2328 /* 2329 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 2330 matrices A & B have been extracted directly into the parallel matrices containing them, or 2331 simply into the sequential matrix identical with the corresponding A (if isize == 1). 
2332 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 2333 to have the same sparsity pattern. 2334 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 2335 must be constructed for C. This is done by setseqmat(s). 2336 */ 2337 for (i = 0, ii = 0; i < ismax; ++i) { 2338 /* 2339 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 2340 That way we can avoid sorting and computing permutations when reusing. 2341 To this end: 2342 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 2343 - if caching arrays to hold the ISs, make and compose a container for them so that it can 2344 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 2345 */ 2346 MatStructure pattern; 2347 if (scall == MAT_INITIAL_MATRIX) { 2348 pattern = DIFFERENT_NONZERO_PATTERN; 2349 } else { 2350 pattern = SAME_NONZERO_PATTERN; 2351 } 2352 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2353 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 2354 if (isize > 1) { 2355 if (scall == MAT_INITIAL_MATRIX) { 2356 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 2357 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2358 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 2359 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 2360 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 2361 } 2362 /* 2363 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 
2364 */ 2365 { 2366 Mat AA,BB; 2367 AA = NULL; 2368 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i]; 2369 BB = NULL; 2370 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii]; 2371 if (AA || BB) { 2372 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 2373 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2374 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2375 } 2376 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) { 2377 /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */ 2378 ierr = MatDestroy(&AA);CHKERRQ(ierr); 2379 } 2380 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) { 2381 /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */ 2382 ierr = MatDestroy(&BB);CHKERRQ(ierr); 2383 } 2384 } 2385 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 2386 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 2387 ++ii; 2388 } else { /* if (isize == 1) */ 2389 if (scall == MAT_INITIAL_MATRIX) { 2390 if (isrow_p[i] || iscol_p[i]) { 2391 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 2392 } else (*submat)[i] = A[i]; 2393 } 2394 if (isrow_p[i] || iscol_p[i]) { 2395 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 2396 /* Otherwise A is extracted straight into (*submats)[i]. */ 2397 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 2398 ierr = MatDestroy(A+i);CHKERRQ(ierr); 2399 } 2400 } 2401 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 2402 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 2403 } 2404 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 2405 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 2406 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 2407 ierr = PetscFree(A);CHKERRQ(ierr); 2408 ierr = PetscFree(B);CHKERRQ(ierr); 2409 PetscFunctionReturn(0); 2410 } 2411 2412 2413 2414 #undef __FUNCT__ 2415 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ" 2416 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2417 { 2418 PetscErrorCode ierr; 2419 2420 PetscFunctionBegin; 2421 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 2422 PetscFunctionReturn(0); 2423 } 2424