1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 22 { 23 PetscInt i; 24 25 PetscFunctionBegin; 26 PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 27 for (i=0; i<ov; ++i) { 28 PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is)); 29 } 30 PetscFunctionReturn(0); 31 } 32 33 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 34 { 35 PetscInt i; 36 37 PetscFunctionBegin; 38 PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 39 for (i=0; i<ov; ++i) { 40 PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is)); 41 } 42 PetscFunctionReturn(0); 43 } 44 45 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 46 { 47 MPI_Comm comm; 48 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j; 49 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 50 PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL; 51 const PetscInt *indices_i,**indices; 52 PetscLayout rmap; 53 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner; 54 PetscSF sf; 55 PetscSFNode *remote; 56 57 PetscFunctionBegin; 58 PetscCall(PetscObjectGetComm((PetscObject)mat,&comm)); 59 PetscCallMPI(MPI_Comm_rank(comm,&rank)); 60 PetscCallMPI(MPI_Comm_size(comm,&size)); 61 /* get row map to determine where rows should be going */ 62 PetscCall(MatGetLayouts(mat,&rmap,NULL)); 63 /* retrieve IS data and put all together so that we 64 * can optimize communication 65 * */ 66 PetscCall(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length)); 67 for (i=0,tlength=0; i<nidx; i++) { 68 PetscCall(ISGetLocalSize(is[i],&length[i])); 69 tlength += length[i]; 70 PetscCall(ISGetIndices(is[i],&indices[i])); 71 } 72 /* find these rows on remote processors */ 73 PetscCall(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids)); 74 PetscCall(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp)); 75 nrrows = 0; 76 for (i=0; i<nidx; i++) { 77 length_i = length[i]; 78 indices_i = indices[i]; 79 for (j=0; j<length_i; j++) { 80 owner = -1; 81 PetscCall(PetscLayoutFindOwner(rmap,indices_i[j],&owner)); 82 /* remote processors */ 83 if (owner != rank) { 84 tosizes_temp[owner]++; /* number of rows to owner */ 85 rrow_ranks[nrrows] = owner; /* processor */ 86 rrow_isids[nrrows] = i; /* is id */ 87 remoterows[nrrows++] = indices_i[j]; /* row */ 88 } 89 } 90 PetscCall(ISRestoreIndices(is[i],&indices[i])); 91 } 92 PetscCall(PetscFree2(*(PetscInt***)&indices,length)); 93 /* test if we need to exchange messages 94 * generally speaking, we do not need to exchange 95 * data when overlap is 1 96 * */ 97 PetscCall(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm)); 98 /* we do not have any messages 99 * It usually corresponds to overlap 1 100 * */ 101 if (!reducednrrows) { 102 PetscCall(PetscFree3(toranks,tosizes,tosizes_temp)); 103 PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids)); 104 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is)); 105 PetscFunctionReturn(0); 106 } 107 nto = 0; 108 /* send sizes and ranks for building a two-sided communcation */ 109 for (i=0; i<size; i++) { 110 if (tosizes_temp[i]) { 111 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 112 tosizes_temp[i] = nto; /* a map from processor to index */ 113 toranks[nto++] = i; /* processor */ 114 } 115 } 116 PetscCall(PetscMalloc1(nto+1,&toffsets)); 117 toffsets[0] = 0; 118 for (i=0; i<nto; i++) { 119 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 120 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 121 } 122 /* send information to other processors */ 123 PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes)); 124 nrecvrows = 0; 125 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 126 PetscCall(PetscMalloc1(nrecvrows,&remote)); 127 nrecvrows = 0; 128 for (i=0; i<nfrom; i++) { 129 for (j=0; j<fromsizes[2*i]; j++) { 130 remote[nrecvrows].rank = fromranks[i]; 131 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 132 } 133 } 134 PetscCall(PetscSFCreate(comm,&sf)); 135 PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER)); 136 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 137 PetscCall(PetscSFSetType(sf,PETSCSFBASIC)); 138 PetscCall(PetscSFSetFromOptions(sf)); 139 /* message pair <no of is, row> */ 140 PetscCall(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata)); 141 for (i=0; i<nrrows; i++) { 142 owner = rrow_ranks[i]; /* processor */ 143 j = tosizes_temp[owner]; /* index */ 144 todata[toffsets[j]++] = rrow_isids[i]; 145 todata[toffsets[j]++] = remoterows[i]; 146 } 147 PetscCall(PetscFree3(toranks,tosizes,tosizes_temp)); 148 PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids)); 149 PetscCall(PetscFree(toffsets)); 150 PetscCall(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE)); 151 PetscCall(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE)); 152 PetscCall(PetscSFDestroy(&sf)); 153 /* send rows belonging to the remote so that then we could get the overlapping data back */ 154 PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata)); 155 PetscCall(PetscFree2(todata,fromdata)); 156 PetscCall(PetscFree(fromsizes)); 157 PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes)); 158 PetscCall(PetscFree(fromranks)); 159 nrecvrows = 0; 160 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 161 PetscCall(PetscCalloc1(nrecvrows,&todata)); 162 PetscCall(PetscMalloc1(nrecvrows,&remote)); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) { 165 for (j=0; j<tosizes[2*i]; j++) { 166 remote[nrecvrows].rank = toranks[i]; 167 remote[nrecvrows++].index = tosizes[2*i+1]+j; 168 } 169 } 170 PetscCall(PetscSFCreate(comm,&sf)); 171 PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER)); 172 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 173 PetscCall(PetscSFSetType(sf,PETSCSFBASIC)); 174 PetscCall(PetscSFSetFromOptions(sf)); 175 /* overlap communication and computation */ 176 PetscCall(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE)); 177 PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is)); 178 PetscCall(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE)); 179 PetscCall(PetscSFDestroy(&sf)); 180 PetscCall(PetscFree2(sbdata,sbsizes)); 181 PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata)); 182 PetscCall(PetscFree(toranks)); 183 PetscCall(PetscFree(tosizes)); 184 PetscCall(PetscFree(todata)); 185 PetscFunctionReturn(0); 186 } 187 188 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 189 { 190 PetscInt *isz,isz_i,i,j,is_id, data_size; 191 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 192 const PetscInt *indices_i_temp; 193 MPI_Comm *iscomms; 194 195 PetscFunctionBegin; 196 max_lsize = 0; 197 PetscCall(PetscMalloc1(nidx,&isz)); 198 for (i=0; i<nidx; i++) { 199 PetscCall(ISGetLocalSize(is[i],&lsize)); 200 max_lsize = lsize>max_lsize ? lsize:max_lsize; 201 isz[i] = lsize; 202 } 203 PetscCall(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms)); 204 for (i=0; i<nidx; i++) { 205 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL)); 206 PetscCall(ISGetIndices(is[i],&indices_i_temp)); 207 PetscCall(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i])); 208 PetscCall(ISRestoreIndices(is[i],&indices_i_temp)); 209 PetscCall(ISDestroy(&is[i])); 210 } 211 /* retrieve information to get row id and its overlap */ 212 for (i=0; i<nrecvs;) { 213 is_id = recvdata[i++]; 214 data_size = recvdata[i++]; 215 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 216 isz_i = isz[is_id]; 217 for (j=0; j< data_size; j++) { 218 col = recvdata[i++]; 219 indices_i[isz_i++] = col; 220 } 221 isz[is_id] = isz_i; 222 } 223 /* remove duplicate entities */ 224 for (i=0; i<nidx; i++) { 225 indices_i = indices_temp+(max_lsize+nrecvs)*i; 226 isz_i = isz[i]; 227 PetscCall(PetscSortRemoveDupsInt(&isz_i,indices_i)); 228 PetscCall(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i])); 229 PetscCall(PetscCommDestroy(&iscomms[i])); 230 } 231 PetscCall(PetscFree(isz)); 232 PetscCall(PetscFree2(indices_temp,iscomms)); 233 PetscFunctionReturn(0); 234 } 235 236 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 237 { 238 PetscLayout rmap,cmap; 239 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 240 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 241 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 242 const PetscInt *gcols,*ai,*aj,*bi,*bj; 243 Mat amat,bmat; 244 PetscMPIInt rank; 245 PetscBool done; 246 MPI_Comm comm; 247 248 PetscFunctionBegin; 249 PetscCall(PetscObjectGetComm((PetscObject)mat,&comm)); 250 PetscCallMPI(MPI_Comm_rank(comm,&rank)); 251 PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols)); 252 /* Even if the mat is symmetric, we still assume it is not symmetric */ 253 PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 254 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 255 PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 256 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 257 /* total number of nonzero values is used to estimate the memory usage in the next step */ 258 tnz = ai[an]+bi[bn]; 259 PetscCall(MatGetLayouts(mat,&rmap,&cmap)); 260 PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL)); 261 PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL)); 262 /* to find the longest message */ 263 max_fszs = 0; 264 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 265 /* better way to estimate number of nonzero in the mat??? */ 266 PetscCall(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp)); 267 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 268 rows_pos = 0; 269 totalrows = 0; 270 for (i=0; i<nfrom; i++) { 271 PetscCall(PetscArrayzero(rows_pos_i,nidx)); 272 /* group data together */ 273 for (j=0; j<fromsizes[2*i]; j+=2) { 274 is_id = fromrows[rows_pos++];/* no of is */ 275 rows_i = rows_data[is_id]; 276 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 277 } 278 /* estimate a space to avoid multiple allocations */ 279 for (j=0; j<nidx; j++) { 280 indvc_ij = 0; 281 rows_i = rows_data[j]; 282 for (l=0; l<rows_pos_i[j]; l++) { 283 row = rows_i[l]-rstart; 284 start = ai[row]; 285 end = ai[row+1]; 286 for (k=start; k<end; k++) { /* Amat */ 287 col = aj[k] + cstart; 288 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 289 } 290 start = bi[row]; 291 end = bi[row+1]; 292 for (k=start; k<end; k++) { /* Bmat */ 293 col = gcols[bj[k]]; 294 indices_tmp[indvc_ij++] = col; 295 } 296 } 297 PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp)); 298 indv_counts[i*nidx+j] = indvc_ij; 299 totalrows += indvc_ij; 300 } 301 } 302 /* message triple <no of is, number of rows, rows> */ 303 PetscCall(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes)); 304 totalrows = 0; 305 rows_pos = 0; 306 /* use this code again */ 307 for (i=0;i<nfrom;i++) { 308 PetscCall(PetscArrayzero(rows_pos_i,nidx)); 309 for (j=0; j<fromsizes[2*i]; j+=2) { 310 is_id = fromrows[rows_pos++]; 311 rows_i = rows_data[is_id]; 312 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 313 } 314 /* add data */ 315 for (j=0; j<nidx; j++) { 316 if (!indv_counts[i*nidx+j]) continue; 317 indvc_ij = 0; 318 sbdata[totalrows++] = j; 319 sbdata[totalrows++] = indv_counts[i*nidx+j]; 320 sbsizes[2*i] += 2; 321 rows_i = rows_data[j]; 322 for (l=0; l<rows_pos_i[j]; l++) { 323 row = rows_i[l]-rstart; 324 start = ai[row]; 325 end = ai[row+1]; 326 for (k=start; k<end; k++) { /* Amat */ 327 col = aj[k] + cstart; 328 indices_tmp[indvc_ij++] = col; 329 } 330 start = bi[row]; 331 end = bi[row+1]; 332 for (k=start; k<end; k++) { /* Bmat */ 333 col = gcols[bj[k]]; 334 indices_tmp[indvc_ij++] = col; 335 } 336 } 337 PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp)); 338 sbsizes[2*i] += indvc_ij; 339 PetscCall(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij)); 340 totalrows += indvc_ij; 341 } 342 } 343 PetscCall(PetscMalloc1(nfrom+1,&offsets)); 344 offsets[0] = 0; 345 for (i=0; i<nfrom; i++) { 346 offsets[i+1] = offsets[i] + sbsizes[2*i]; 347 sbsizes[2*i+1] = offsets[i]; 348 } 349 PetscCall(PetscFree(offsets)); 350 if (sbrowsizes) *sbrowsizes = sbsizes; 351 if (sbrows) *sbrows = sbdata; 352 PetscCall(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp)); 353 PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 354 PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 355 PetscFunctionReturn(0); 356 } 357 358 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 359 { 360 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 361 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 362 PetscInt lsize,lsize_tmp; 363 PetscMPIInt rank,owner; 364 Mat amat,bmat; 365 PetscBool done; 366 PetscLayout cmap,rmap; 367 MPI_Comm comm; 368 369 PetscFunctionBegin; 370 PetscCall(PetscObjectGetComm((PetscObject)mat,&comm)); 371 PetscCallMPI(MPI_Comm_rank(comm,&rank)); 372 PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols)); 373 PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 374 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 375 PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 376 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 377 /* is it a safe way to compute number of nonzero values ? */ 378 tnz = ai[an]+bi[bn]; 379 PetscCall(MatGetLayouts(mat,&rmap,&cmap)); 380 PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL)); 381 PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL)); 382 /* it is a better way to estimate memory than the old implementation 383 * where global size of matrix is used 384 * */ 385 PetscCall(PetscMalloc1(tnz,&indices_temp)); 386 for (i=0; i<nidx; i++) { 387 MPI_Comm iscomm; 388 389 PetscCall(ISGetLocalSize(is[i],&lsize)); 390 PetscCall(ISGetIndices(is[i],&indices)); 391 lsize_tmp = 0; 392 for (j=0; j<lsize; j++) { 393 owner = -1; 394 row = indices[j]; 395 PetscCall(PetscLayoutFindOwner(rmap,row,&owner)); 396 if (owner != rank) continue; 397 /* local number */ 398 row -= rstart; 399 start = ai[row]; 400 end = ai[row+1]; 401 for (k=start; k<end; k++) { /* Amat */ 402 col = aj[k] + cstart; 403 indices_temp[lsize_tmp++] = col; 404 } 405 start = bi[row]; 406 end = bi[row+1]; 407 for (k=start; k<end; k++) { /* Bmat */ 408 col = gcols[bj[k]]; 409 indices_temp[lsize_tmp++] = col; 410 } 411 } 412 PetscCall(ISRestoreIndices(is[i],&indices)); 413 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL)); 414 PetscCall(ISDestroy(&is[i])); 415 PetscCall(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp)); 416 PetscCall(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i])); 417 PetscCall(PetscCommDestroy(&iscomm)); 418 } 419 PetscCall(PetscFree(indices_temp)); 420 PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 421 PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 422 PetscFunctionReturn(0); 423 } 424 425 /* 426 Sample message format: 427 If a processor A wants processor B to process some elements corresponding 428 to index sets is[1],is[5] 429 mesg [0] = 2 (no of index sets in the mesg) 430 ----------- 431 mesg [1] = 1 => is[1] 432 mesg [2] = sizeof(is[1]); 433 ----------- 434 mesg [3] = 5 => is[5] 435 mesg [4] = sizeof(is[5]); 436 ----------- 437 mesg [5] 438 mesg [n] datas[1] 439 ----------- 440 mesg[n+1] 441 mesg[m] data(is[5]) 442 ----------- 443 444 Notes: 445 nrqs - no of requests sent (or to be sent out) 446 nrqr - no of requests received (which have to be or which have been processed) 447 */ 448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 449 { 450 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 451 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 452 const PetscInt **idx,*idx_i; 453 PetscInt *n,**data,len; 454 #if defined(PETSC_USE_CTABLE) 455 PetscTable *table_data,table_data_i; 456 PetscInt *tdata,tcount,tcount_max; 457 #else 458 PetscInt *data_i,*d_p; 459 #endif 460 PetscMPIInt size,rank,tag1,tag2,proc = 0; 461 PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 462 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 463 PetscBT *table; 464 MPI_Comm comm; 465 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 466 MPI_Status *recv_status; 467 MPI_Comm *iscomms; 468 char *t_p; 469 470 PetscFunctionBegin; 471 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 472 size = c->size; 473 rank = c->rank; 474 M = C->rmap->N; 475 476 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1)); 477 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2)); 478 479 PetscCall(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n)); 480 481 for (i=0; i<imax; i++) { 482 PetscCall(ISGetIndices(is[i],&idx[i])); 483 PetscCall(ISGetLocalSize(is[i],&n[i])); 484 } 485 486 /* evaluate communication - mesg to who,length of mesg, and buffer space 487 required. Based on this, buffers are allocated, and data copied into them */ 488 PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4)); 489 for (i=0; i<imax; i++) { 490 PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/ 491 idx_i = idx[i]; 492 len = n[i]; 493 for (j=0; j<len; j++) { 494 row = idx_i[j]; 495 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 496 PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc)); 497 w4[proc]++; 498 } 499 for (j=0; j<size; j++) { 500 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 501 } 502 } 503 504 nrqs = 0; /* no of outgoing messages */ 505 msz = 0; /* total mesg length (for all proc */ 506 w1[rank] = 0; /* no mesg sent to intself */ 507 w3[rank] = 0; 508 for (i=0; i<size; i++) { 509 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 510 } 511 /* pa - is list of processors to communicate with */ 512 PetscCall(PetscMalloc1(nrqs,&pa)); 513 for (i=0,j=0; i<size; i++) { 514 if (w1[i]) {pa[j] = i; j++;} 515 } 516 517 /* Each message would have a header = 1 + 2*(no of IS) + data */ 518 for (i=0; i<nrqs; i++) { 519 j = pa[i]; 520 w1[j] += w2[j] + 2*w3[j]; 521 msz += w1[j]; 522 } 523 524 /* Determine the number of messages to expect, their lengths, from from-ids */ 525 PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 526 PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 527 528 /* Now post the Irecvs corresponding to these messages */ 529 PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1)); 530 531 /* Allocate Memory for outgoing messages */ 532 PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr)); 533 PetscCall(PetscArrayzero(outdat,size)); 534 PetscCall(PetscArrayzero(ptr,size)); 535 536 { 537 PetscInt *iptr = tmp,ict = 0; 538 for (i=0; i<nrqs; i++) { 539 j = pa[i]; 540 iptr += ict; 541 outdat[j] = iptr; 542 ict = w1[j]; 543 } 544 } 545 546 /* Form the outgoing messages */ 547 /* plug in the headers */ 548 for (i=0; i<nrqs; i++) { 549 j = pa[i]; 550 outdat[j][0] = 0; 551 PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j])); 552 ptr[j] = outdat[j] + 2*w3[j] + 1; 553 } 554 555 /* Memory for doing local proc's work */ 556 { 557 PetscInt M_BPB_imax = 0; 558 #if defined(PETSC_USE_CTABLE) 559 PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax)); 560 PetscCall(PetscMalloc1(imax,&table_data)); 561 for (i=0; i<imax; i++) { 562 PetscCall(PetscTableCreate(n[i],M,&table_data[i])); 563 } 564 PetscCall(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p)); 565 for (i=0; i<imax; i++) { 566 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 567 } 568 #else 569 PetscInt Mimax = 0; 570 PetscCall(PetscIntMultError(M,imax, &Mimax)); 571 PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax)); 572 PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p)); 573 for (i=0; i<imax; i++) { 574 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 575 data[i] = d_p + M*i; 576 } 577 #endif 578 } 579 580 /* Parse the IS and update local tables and the outgoing buf with the data */ 581 { 582 PetscInt n_i,isz_i,*outdat_j,ctr_j; 583 PetscBT table_i; 584 585 for (i=0; i<imax; i++) { 586 PetscCall(PetscArrayzero(ctr,size)); 587 n_i = n[i]; 588 table_i = table[i]; 589 idx_i = idx[i]; 590 #if defined(PETSC_USE_CTABLE) 591 table_data_i = table_data[i]; 592 #else 593 data_i = data[i]; 594 #endif 595 isz_i = isz[i]; 596 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 597 row = idx_i[j]; 598 PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc)); 599 if (proc != rank) { /* copy to the outgoing buffer */ 600 ctr[proc]++; 601 *ptr[proc] = row; 602 ptr[proc]++; 603 } else if (!PetscBTLookupSet(table_i,row)) { 604 #if defined(PETSC_USE_CTABLE) 605 PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES)); 606 #else 607 data_i[isz_i] = row; /* Update the local table */ 608 #endif 609 isz_i++; 610 } 611 } 612 /* Update the headers for the current IS */ 613 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 614 if ((ctr_j = ctr[j])) { 615 outdat_j = outdat[j]; 616 k = ++outdat_j[0]; 617 outdat_j[2*k] = ctr_j; 618 outdat_j[2*k-1] = i; 619 } 620 } 621 isz[i] = isz_i; 622 } 623 } 624 625 /* Now post the sends */ 626 PetscCall(PetscMalloc1(nrqs,&s_waits1)); 627 for (i=0; i<nrqs; ++i) { 628 j = pa[i]; 629 PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i)); 630 } 631 632 /* No longer need the original indices */ 633 PetscCall(PetscMalloc1(imax,&iscomms)); 634 for (i=0; i<imax; ++i) { 635 PetscCall(ISRestoreIndices(is[i],idx+i)); 636 PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL)); 637 } 638 PetscCall(PetscFree2(*(PetscInt***)&idx,n)); 639 640 for (i=0; i<imax; ++i) { 641 PetscCall(ISDestroy(&is[i])); 642 } 643 644 /* Do Local work */ 645 #if defined(PETSC_USE_CTABLE) 646 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data)); 647 #else 648 PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL)); 649 #endif 650 651 /* Receive messages */ 652 PetscCall(PetscMalloc1(nrqr,&recv_status)); 653 PetscCallMPI(MPI_Waitall(nrqr,r_waits1,recv_status)); 654 PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE)); 655 656 /* Phase 1 sends are complete - deallocate buffers */ 657 PetscCall(PetscFree4(outdat,ptr,tmp,ctr)); 658 PetscCall(PetscFree4(w1,w2,w3,w4)); 659 660 PetscCall(PetscMalloc1(nrqr,&xdata)); 661 PetscCall(PetscMalloc1(nrqr,&isz1)); 662 PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1)); 663 PetscCall(PetscFree(rbuf[0])); 664 PetscCall(PetscFree(rbuf)); 665 666 /* Send the data back */ 667 /* Do a global reduction to know the buffer space req for incoming messages */ 668 { 669 PetscMPIInt *rw1; 670 671 PetscCall(PetscCalloc1(size,&rw1)); 672 673 for (i=0; i<nrqr; ++i) { 674 proc = recv_status[i].MPI_SOURCE; 675 676 PetscCheckFalse(proc != onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 677 rw1[proc] = isz1[i]; 678 } 679 PetscCall(PetscFree(onodes1)); 680 PetscCall(PetscFree(olengths1)); 681 682 /* Determine the number of messages to expect, their lengths, from from-ids */ 683 PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2)); 684 PetscCall(PetscFree(rw1)); 685 } 686 /* Now post the Irecvs corresponding to these messages */ 687 PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2)); 688 689 /* Now post the sends */ 690 PetscCall(PetscMalloc1(nrqr,&s_waits2)); 691 for (i=0; i<nrqr; ++i) { 692 j = recv_status[i].MPI_SOURCE; 693 PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i)); 694 } 695 696 /* receive work done on other processors */ 697 { 698 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 699 PetscMPIInt idex; 700 PetscBT table_i; 701 702 for (i=0; i<nrqs; ++i) { 703 PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE)); 704 /* Process the message */ 705 rbuf2_i = rbuf2[idex]; 706 ct1 = 2*rbuf2_i[0]+1; 707 jmax = rbuf2[idex][0]; 708 for (j=1; j<=jmax; j++) { 709 max = rbuf2_i[2*j]; 710 is_no = rbuf2_i[2*j-1]; 711 isz_i = isz[is_no]; 712 table_i = table[is_no]; 713 #if defined(PETSC_USE_CTABLE) 714 table_data_i = table_data[is_no]; 715 #else 716 data_i = data[is_no]; 717 #endif 718 for (k=0; k<max; k++,ct1++) { 719 row = rbuf2_i[ct1]; 720 if (!PetscBTLookupSet(table_i,row)) { 721 #if defined(PETSC_USE_CTABLE) 722 PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES)); 723 #else 724 data_i[isz_i] = row; 725 #endif 726 isz_i++; 727 } 728 } 729 isz[is_no] = isz_i; 730 } 731 } 732 733 PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE)); 734 } 735 736 #if defined(PETSC_USE_CTABLE) 737 tcount_max = 0; 738 for (i=0; i<imax; ++i) { 739 table_data_i = table_data[i]; 740 PetscCall(PetscTableGetCount(table_data_i,&tcount)); 741 if (tcount_max < tcount) tcount_max = tcount; 742 } 743 PetscCall(PetscMalloc1(tcount_max+1,&tdata)); 744 #endif 745 746 for (i=0; i<imax; ++i) { 747 #if defined(PETSC_USE_CTABLE) 748 PetscTablePosition tpos; 749 table_data_i = table_data[i]; 750 751 PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos)); 752 while (tpos) { 753 PetscCall(PetscTableGetNext(table_data_i,&tpos,&k,&j)); 754 tdata[--j] = --k; 755 } 756 PetscCall(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i)); 757 #else 758 PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i)); 759 #endif 760 PetscCall(PetscCommDestroy(&iscomms[i])); 761 } 762 763 PetscCall(PetscFree(iscomms)); 764 PetscCall(PetscFree(onodes2)); 765 PetscCall(PetscFree(olengths2)); 766 767 PetscCall(PetscFree(pa)); 768 PetscCall(PetscFree(rbuf2[0])); 769 PetscCall(PetscFree(rbuf2)); 770 PetscCall(PetscFree(s_waits1)); 771 PetscCall(PetscFree(r_waits1)); 772 PetscCall(PetscFree(s_waits2)); 773 PetscCall(PetscFree(r_waits2)); 774 PetscCall(PetscFree(recv_status)); 775 if (xdata) { 776 PetscCall(PetscFree(xdata[0])); 777 PetscCall(PetscFree(xdata)); 778 } 779 PetscCall(PetscFree(isz1)); 780 #if defined(PETSC_USE_CTABLE) 781 for (i=0; i<imax; i++) { 782 PetscCall(PetscTableDestroy((PetscTable*)&table_data[i])); 783 } 784 PetscCall(PetscFree(table_data)); 785 PetscCall(PetscFree(tdata)); 786 PetscCall(PetscFree4(table,data,isz,t_p)); 787 #else 788 PetscCall(PetscFree5(table,data,isz,d_p,t_p)); 789 #endif 790 PetscFunctionReturn(0); 791 } 792 793 /* 794 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 795 the work on the local processor. 796 797 Inputs: 798 C - MAT_MPIAIJ; 799 imax - total no of index sets processed at a time; 800 table - an array of char - size = m bits. 801 802 Output: 803 isz - array containing the count of the solution elements corresponding 804 to each index set; 805 data or table_data - pointer to the solutions 806 */ 807 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 808 { 809 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 810 Mat A = c->A,B = c->B; 811 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 812 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 813 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 814 PetscBT table_i; 815 #if defined(PETSC_USE_CTABLE) 816 PetscTable table_data_i; 817 PetscTablePosition tpos; 818 PetscInt tcount,*tdata; 819 #else 820 PetscInt *data_i; 821 #endif 822 823 PetscFunctionBegin; 824 rstart = C->rmap->rstart; 825 cstart = C->cmap->rstart; 826 ai = a->i; 827 aj = a->j; 828 bi = b->i; 829 bj = b->j; 830 garray = c->garray; 831 832 for (i=0; i<imax; i++) { 833 #if defined(PETSC_USE_CTABLE) 834 /* copy existing entries of table_data_i into tdata[] */ 835 table_data_i = table_data[i]; 836 PetscCall(PetscTableGetCount(table_data_i,&tcount)); 837 PetscCheckFalse(tcount != isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]); 838 839 PetscCall(PetscMalloc1(tcount,&tdata)); 840 PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos)); 841 while (tpos) { 842 PetscCall(PetscTableGetNext(table_data_i,&tpos,&row,&j)); 843 tdata[--j] = --row; 844 PetscCheckFalse(j > tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount); 845 } 846 #else 847 data_i = data[i]; 848 #endif 849 table_i = table[i]; 850 isz_i = isz[i]; 851 max = isz[i]; 852 853 for (j=0; j<max; j++) { 854 #if defined(PETSC_USE_CTABLE) 855 row = tdata[j] - rstart; 856 #else 857 row = data_i[j] - rstart; 858 #endif 859 start = ai[row]; 860 end = ai[row+1]; 861 for (k=start; k<end; k++) { /* Amat */ 862 val = aj[k] + cstart; 863 if (!PetscBTLookupSet(table_i,val)) { 864 #if defined(PETSC_USE_CTABLE) 865 PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES)); 866 #else 867 data_i[isz_i] = val; 868 #endif 869 isz_i++; 870 } 871 } 872 start = bi[row]; 873 end = bi[row+1]; 874 for (k=start; k<end; k++) { /* Bmat */ 875 val = garray[bj[k]]; 876 if (!PetscBTLookupSet(table_i,val)) { 877 #if defined(PETSC_USE_CTABLE) 878 PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES)); 879 #else 880 data_i[isz_i] = val; 881 #endif 882 isz_i++; 883 } 884 } 885 } 886 isz[i] = isz_i; 887 888 #if defined(PETSC_USE_CTABLE) 889 PetscCall(PetscFree(tdata)); 890 #endif 891 } 892 PetscFunctionReturn(0); 893 } 894 895 /* 896 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 897 and return the output 898 899 Input: 900 C - the matrix 901 nrqr - no of messages being processed. 902 rbuf - an array of pointers to the received requests 903 904 Output: 905 xdata - array of messages to be sent back 906 isz1 - size of each message 907 908 For better efficiency perhaps we should malloc separately each xdata[i], 909 then if a remalloc is required we need only copy the data for that one row 910 rather then all previous rows as it is now where a single large chunk of 911 memory is used. 912 913 */ 914 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 915 { 916 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 917 Mat A = c->A,B = c->B; 918 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 919 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 920 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 921 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 922 PetscInt *rbuf_i,kmax,rbuf_0; 923 PetscBT xtable; 924 925 PetscFunctionBegin; 926 m = C->rmap->N; 927 rstart = C->rmap->rstart; 928 cstart = C->cmap->rstart; 929 ai = a->i; 930 aj = a->j; 931 bi = b->i; 932 bj = b->j; 933 garray = c->garray; 934 935 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 936 rbuf_i = rbuf[i]; 937 rbuf_0 = rbuf_i[0]; 938 ct += rbuf_0; 939 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 940 } 941 942 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 943 else max1 = 1; 944 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 945 if (nrqr) { 946 PetscCall(PetscMalloc1(mem_estimate,&xdata[0])); 947 ++no_malloc; 948 } 949 PetscCall(PetscBTCreate(m,&xtable)); 950 PetscCall(PetscArrayzero(isz1,nrqr)); 951 952 ct3 = 0; 953 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 954 rbuf_i = rbuf[i]; 955 rbuf_0 = rbuf_i[0]; 956 ct1 = 2*rbuf_0+1; 957 ct2 = ct1; 958 ct3 += ct1; 959 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 960 PetscCall(PetscBTMemzero(m,xtable)); 961 oct2 = ct2; 962 kmax = rbuf_i[2*j]; 963 for (k=0; k<kmax; k++,ct1++) { 964 row = rbuf_i[ct1]; 965 if (!PetscBTLookupSet(xtable,row)) { 966 if (!(ct3 < mem_estimate)) { 967 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 968 PetscCall(PetscMalloc1(new_estimate,&tmp)); 969 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 970 PetscCall(PetscFree(xdata[0])); 971 xdata[0] = tmp; 972 mem_estimate = new_estimate; ++no_malloc; 973 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 974 } 975 xdata[i][ct2++] = row; 976 ct3++; 977 } 978 } 979 for (k=oct2,max2=ct2; k<max2; k++) { 980 row = xdata[i][k] - rstart; 981 start = ai[row]; 982 end = ai[row+1]; 983 for (l=start; l<end; l++) { 984 val = aj[l] + cstart; 985 if (!PetscBTLookupSet(xtable,val)) { 986 if (!(ct3 < mem_estimate)) { 987 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 988 PetscCall(PetscMalloc1(new_estimate,&tmp)); 989 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 990 PetscCall(PetscFree(xdata[0])); 991 xdata[0] = tmp; 992 mem_estimate = new_estimate; ++no_malloc; 993 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 994 } 995 xdata[i][ct2++] = val; 996 ct3++; 997 } 998 } 999 start = bi[row]; 1000 end = bi[row+1]; 1001 for (l=start; l<end; l++) { 1002 val = garray[bj[l]]; 1003 if (!PetscBTLookupSet(xtable,val)) { 1004 if (!(ct3 < mem_estimate)) { 1005 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1006 PetscCall(PetscMalloc1(new_estimate,&tmp)); 1007 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 1008 PetscCall(PetscFree(xdata[0])); 1009 xdata[0] = tmp; 1010 mem_estimate = new_estimate; ++no_malloc; 1011 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1012 } 1013 xdata[i][ct2++] = val; 1014 ct3++; 1015 } 1016 } 1017 } 1018 /* Update the header*/ 1019 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1020 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1021 } 1022 xdata[i][0] = rbuf_0; 1023 if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2; 1024 isz1[i] = ct2; /* size of each message */ 1025 } 1026 PetscCall(PetscBTDestroy(&xtable)); 1027 PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc)); 1028 PetscFunctionReturn(0); 1029 } 1030 /* -------------------------------------------------------------------------*/ 1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1032 /* 1033 Every processor gets the entire matrix 1034 */ 1035 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1036 { 1037 Mat B; 1038 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1039 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1040 PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL; 1041 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1042 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1043 1044 PetscFunctionBegin; 1045 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1046 PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank)); 1047 if (scall == MAT_INITIAL_MATRIX) { 1048 /* ---------------------------------------------------------------- 1049 Tell every processor the number of nonzeros per row 1050 */ 1051 PetscCall(PetscMalloc1(A->rmap->N,&lens)); 1052 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1053 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1054 } 1055 PetscCall(PetscMalloc2(size,&recvcounts,size,&displs)); 1056 for (i=0; i<size; i++) { 1057 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1058 displs[i] = A->rmap->range[i]; 1059 } 1060 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A))); 1061 /* --------------------------------------------------------------- 1062 Create the sequential matrix of the same type as the local block diagonal 1063 */ 1064 PetscCall(MatCreate(PETSC_COMM_SELF,&B)); 1065 PetscCall(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE)); 1066 PetscCall(MatSetBlockSizesFromMats(B,A,A)); 1067 PetscCall(MatSetType(B,((PetscObject)a->A)->type_name)); 1068 PetscCall(MatSeqAIJSetPreallocation(B,0,lens)); 1069 PetscCall(PetscCalloc1(2,Bin)); 1070 **Bin = B; 1071 b = (Mat_SeqAIJ*)B->data; 1072 1073 /*-------------------------------------------------------------------- 1074 Copy my part of matrix column indices over 1075 */ 1076 sendcount = ad->nz + bd->nz; 1077 jsendbuf = b->j + b->i[rstarts[rank]]; 1078 a_jsendbuf = ad->j; 1079 b_jsendbuf = bd->j; 1080 n = A->rmap->rend - A->rmap->rstart; 1081 cnt = 0; 1082 for (i=0; i<n; i++) { 1083 /* put in lower diagonal portion */ 1084 m = bd->i[i+1] - bd->i[i]; 1085 while (m > 0) { 1086 /* is it above diagonal (in bd (compressed) numbering) */ 1087 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1088 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1089 m--; 1090 } 1091 1092 /* put in diagonal portion */ 1093 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1094 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1095 } 1096 1097 /* put in upper diagonal portion */ 1098 while (m-- > 0) { 1099 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1100 } 1101 } 1102 PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt); 1103 1104 /*-------------------------------------------------------------------- 1105 Gather all column indices to all processors 1106 */ 1107 for (i=0; i<size; i++) { 1108 recvcounts[i] = 0; 1109 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1110 recvcounts[i] += lens[j]; 1111 } 1112 } 1113 displs[0] = 0; 1114 for (i=1; i<size; i++) { 1115 displs[i] = displs[i-1] + recvcounts[i-1]; 1116 } 1117 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A))); 1118 /*-------------------------------------------------------------------- 1119 Assemble the matrix into useable form (note numerical values not yet set) 1120 */ 1121 /* set the b->ilen (length of each row) values */ 1122 PetscCall(PetscArraycpy(b->ilen,lens,A->rmap->N)); 1123 /* set the b->i indices */ 1124 b->i[0] = 0; 1125 for (i=1; i<=A->rmap->N; i++) { 1126 b->i[i] = b->i[i-1] + lens[i-1]; 1127 } 1128 PetscCall(PetscFree(lens)); 1129 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 1130 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 1131 1132 } else { 1133 B = **Bin; 1134 b = (Mat_SeqAIJ*)B->data; 1135 } 1136 1137 /*-------------------------------------------------------------------- 1138 Copy my part of matrix numerical values into the values location 1139 */ 1140 if (flag == MAT_GET_VALUES) { 1141 const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf; 1142 MatScalar *sendbuf,*recvbuf; 1143 1144 PetscCall(MatSeqAIJGetArrayRead(a->A,&ada)); 1145 PetscCall(MatSeqAIJGetArrayRead(a->B,&bda)); 1146 sendcount = ad->nz + bd->nz; 1147 sendbuf = b->a + b->i[rstarts[rank]]; 1148 a_sendbuf = ada; 1149 b_sendbuf = bda; 1150 b_sendj = bd->j; 1151 n = A->rmap->rend - A->rmap->rstart; 1152 cnt = 0; 1153 for (i=0; i<n; i++) { 1154 /* put in lower diagonal portion */ 1155 m = bd->i[i+1] - bd->i[i]; 1156 while (m > 0) { 1157 /* is it above diagonal (in bd (compressed) numbering) */ 1158 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1159 sendbuf[cnt++] = *b_sendbuf++; 1160 m--; 1161 b_sendj++; 1162 } 1163 1164 /* put in diagonal portion */ 1165 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1166 sendbuf[cnt++] = *a_sendbuf++; 1167 } 1168 1169 /* put in upper diagonal portion */ 1170 while (m-- > 0) { 1171 sendbuf[cnt++] = *b_sendbuf++; 1172 b_sendj++; 1173 } 1174 } 1175 PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt); 1176 1177 /* ----------------------------------------------------------------- 1178 Gather all numerical values to all processors 1179 */ 1180 if (!recvcounts) { 1181 PetscCall(PetscMalloc2(size,&recvcounts,size,&displs)); 1182 } 1183 for (i=0; i<size; i++) { 1184 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1185 } 1186 displs[0] = 0; 1187 for (i=1; i<size; i++) { 1188 displs[i] = displs[i-1] + recvcounts[i-1]; 1189 } 1190 recvbuf = b->a; 1191 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A))); 1192 PetscCall(MatSeqAIJRestoreArrayRead(a->A,&ada)); 1193 PetscCall(MatSeqAIJRestoreArrayRead(a->B,&bda)); 1194 } /* endof (flag == MAT_GET_VALUES) */ 1195 PetscCall(PetscFree2(recvcounts,displs)); 1196 1197 if (A->symmetric) { 1198 PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE)); 1199 } else if (A->hermitian) { 1200 PetscCall(MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE)); 1201 } else if (A->structurally_symmetric) { 1202 PetscCall(MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE)); 1203 } 1204 PetscFunctionReturn(0); 1205 } 1206 1207 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1208 { 1209 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1210 Mat submat,A = c->A,B = c->B; 1211 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1212 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1213 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1214 const PetscInt *icol,*irow; 1215 PetscInt nrow,ncol,start; 1216 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1217 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1218 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1219 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1220 PetscInt *lens,rmax,ncols,*cols,Crow; 1221 #if defined(PETSC_USE_CTABLE) 1222 PetscTable cmap,rmap; 1223 PetscInt *cmap_loc,*rmap_loc; 1224 #else 1225 PetscInt *cmap,*rmap; 1226 #endif 1227 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1228 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1229 PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL; 1230 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1231 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1232 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1233 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1234 MPI_Comm comm; 1235 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1236 PetscMPIInt *onodes1,*olengths1,idex,end; 1237 Mat_SubSppt *smatis1; 1238 PetscBool isrowsorted,iscolsorted; 1239 1240 PetscFunctionBegin; 1241 PetscValidLogicalCollectiveInt(C,ismax,2); 1242 PetscValidLogicalCollectiveEnum(C,scall,5); 1243 PetscCheckFalse(ismax != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1244 PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a)); 1245 PetscCall(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a)); 1246 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 1247 size = c->size; 1248 rank = c->rank; 1249 1250 PetscCall(ISSorted(iscol[0],&iscolsorted)); 1251 PetscCall(ISSorted(isrow[0],&isrowsorted)); 1252 PetscCall(ISGetIndices(isrow[0],&irow)); 1253 PetscCall(ISGetLocalSize(isrow[0],&nrow)); 1254 if (allcolumns) { 1255 icol = NULL; 1256 ncol = C->cmap->N; 1257 } else { 1258 PetscCall(ISGetIndices(iscol[0],&icol)); 1259 PetscCall(ISGetLocalSize(iscol[0],&ncol)); 1260 } 1261 1262 if (scall == MAT_INITIAL_MATRIX) { 1263 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1264 1265 /* Get some new tags to keep the communication clean */ 1266 tag1 = ((PetscObject)C)->tag; 1267 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2)); 1268 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3)); 1269 1270 /* evaluate communication - mesg to who, length of mesg, and buffer space 1271 required. Based on this, buffers are allocated, and data copied into them */ 1272 PetscCall(PetscCalloc2(size,&w1,size,&w2)); 1273 PetscCall(PetscMalloc1(nrow,&row2proc)); 1274 1275 /* w1[proc] = num of rows owned by proc -- to be requested */ 1276 proc = 0; 1277 nrqs = 0; /* num of outgoing messages */ 1278 for (j=0; j<nrow; j++) { 1279 row = irow[j]; 1280 if (!isrowsorted) proc = 0; 1281 while (row >= C->rmap->range[proc+1]) proc++; 1282 w1[proc]++; 1283 row2proc[j] = proc; /* map row index to proc */ 1284 1285 if (proc != rank && !w2[proc]) { 1286 w2[proc] = 1; nrqs++; 1287 } 1288 } 1289 w1[rank] = 0; /* rows owned by self will not be requested */ 1290 1291 PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/ 1292 for (proc=0,j=0; proc<size; proc++) { 1293 if (w1[proc]) { pa[j++] = proc;} 1294 } 1295 1296 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1297 msz = 0; /* total mesg length (for all procs) */ 1298 for (i=0; i<nrqs; i++) { 1299 proc = pa[i]; 1300 w1[proc] += 3; 1301 msz += w1[proc]; 1302 } 1303 PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz)); 1304 1305 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1306 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1307 PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 1308 1309 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1310 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1311 PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 1312 1313 /* Now post the Irecvs corresponding to these messages */ 1314 PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1)); 1315 1316 PetscCall(PetscFree(onodes1)); 1317 PetscCall(PetscFree(olengths1)); 1318 1319 /* Allocate Memory for outgoing messages */ 1320 PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr)); 1321 PetscCall(PetscArrayzero(sbuf1,size)); 1322 PetscCall(PetscArrayzero(ptr,size)); 1323 1324 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1325 iptr = tmp; 1326 for (i=0; i<nrqs; i++) { 1327 proc = pa[i]; 1328 sbuf1[proc] = iptr; 1329 iptr += w1[proc]; 1330 } 1331 1332 /* Form the outgoing messages */ 1333 /* Initialize the header space */ 1334 for (i=0; i<nrqs; i++) { 1335 proc = pa[i]; 1336 PetscCall(PetscArrayzero(sbuf1[proc],3)); 1337 ptr[proc] = sbuf1[proc] + 3; 1338 } 1339 1340 /* Parse the isrow and copy data into outbuf */ 1341 PetscCall(PetscArrayzero(ctr,size)); 1342 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1343 proc = row2proc[j]; 1344 if (proc != rank) { /* copy to the outgoing buf*/ 1345 *ptr[proc] = irow[j]; 1346 ctr[proc]++; ptr[proc]++; 1347 } 1348 } 1349 1350 /* Update the headers for the current IS */ 1351 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1352 if ((ctr_j = ctr[j])) { 1353 sbuf1_j = sbuf1[j]; 1354 k = ++sbuf1_j[0]; 1355 sbuf1_j[2*k] = ctr_j; 1356 sbuf1_j[2*k-1] = 0; 1357 } 1358 } 1359 1360 /* Now post the sends */ 1361 PetscCall(PetscMalloc1(nrqs,&s_waits1)); 1362 for (i=0; i<nrqs; ++i) { 1363 proc = pa[i]; 1364 PetscCallMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i)); 1365 } 1366 1367 /* Post Receives to capture the buffer size */ 1368 PetscCall(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2)); 1369 PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3)); 1370 1371 if (nrqs) rbuf2[0] = tmp + msz; 1372 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1373 1374 for (i=0; i<nrqs; ++i) { 1375 proc = pa[i]; 1376 PetscCallMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i)); 1377 } 1378 1379 PetscCall(PetscFree2(w1,w2)); 1380 1381 /* Send to other procs the buf size they should allocate */ 1382 /* Receive messages*/ 1383 PetscCall(PetscMalloc1(nrqr,&r_status1)); 1384 PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1)); 1385 1386 PetscCallMPI(MPI_Waitall(nrqr,r_waits1,r_status1)); 1387 for (i=0; i<nrqr; ++i) { 1388 req_size[i] = 0; 1389 rbuf1_i = rbuf1[i]; 1390 start = 2*rbuf1_i[0] + 1; 1391 PetscCallMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end)); 1392 PetscCall(PetscMalloc1(end,&sbuf2[i])); 1393 sbuf2_i = sbuf2[i]; 1394 for (j=start; j<end; j++) { 1395 k = rbuf1_i[j] - rstart; 1396 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1397 sbuf2_i[j] = ncols; 1398 req_size[i] += ncols; 1399 } 1400 req_source1[i] = r_status1[i].MPI_SOURCE; 1401 1402 /* form the header */ 1403 sbuf2_i[0] = req_size[i]; 1404 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1405 1406 PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i)); 1407 } 1408 1409 PetscCall(PetscFree(r_status1)); 1410 PetscCall(PetscFree(r_waits1)); 1411 1412 /* rbuf2 is received, Post recv column indices a->j */ 1413 PetscCallMPI(MPI_Waitall(nrqs,r_waits2,r_status2)); 1414 1415 PetscCall(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3)); 1416 for (i=0; i<nrqs; ++i) { 1417 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); 1418 req_source2[i] = r_status2[i].MPI_SOURCE; 1419 PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i)); 1420 } 1421 1422 /* Wait on sends1 and sends2 */ 1423 PetscCall(PetscMalloc1(nrqs,&s_status1)); 1424 PetscCallMPI(MPI_Waitall(nrqs,s_waits1,s_status1)); 1425 PetscCall(PetscFree(s_waits1)); 1426 PetscCall(PetscFree(s_status1)); 1427 1428 PetscCallMPI(MPI_Waitall(nrqr,s_waits2,s_status2)); 1429 PetscCall(PetscFree4(r_status2,s_waits2,r_waits2,s_status2)); 1430 1431 /* Now allocate sending buffers for a->j, and send them off */ 1432 PetscCall(PetscMalloc1(nrqr,&sbuf_aj)); 1433 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1434 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0])); 1435 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1436 1437 for (i=0; i<nrqr; i++) { /* for each requested message */ 1438 rbuf1_i = rbuf1[i]; 1439 sbuf_aj_i = sbuf_aj[i]; 1440 ct1 = 2*rbuf1_i[0] + 1; 1441 ct2 = 0; 1442 /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1443 1444 kmax = rbuf1[i][2]; 1445 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1446 row = rbuf1_i[ct1] - rstart; 1447 nzA = ai[row+1] - ai[row]; 1448 nzB = bi[row+1] - bi[row]; 1449 ncols = nzA + nzB; 1450 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1451 1452 /* load the column indices for this row into cols*/ 1453 cols = sbuf_aj_i + ct2; 1454 1455 lwrite = 0; 1456 for (l=0; l<nzB; l++) { 1457 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1458 } 1459 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1460 for (l=0; l<nzB; l++) { 1461 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1462 } 1463 1464 ct2 += ncols; 1465 } 1466 PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i)); 1467 } 1468 1469 /* create column map (cmap): global col of C -> local col of submat */ 1470 #if defined(PETSC_USE_CTABLE) 1471 if (!allcolumns) { 1472 PetscCall(PetscTableCreate(ncol,C->cmap->N,&cmap)); 1473 PetscCall(PetscCalloc1(C->cmap->n,&cmap_loc)); 1474 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1475 if (icol[j] >= cstart && icol[j] <cend) { 1476 cmap_loc[icol[j] - cstart] = j+1; 1477 } else { /* use PetscTable for non-local col indices */ 1478 PetscCall(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES)); 1479 } 1480 } 1481 } else { 1482 cmap = NULL; 1483 cmap_loc = NULL; 1484 } 1485 PetscCall(PetscCalloc1(C->rmap->n,&rmap_loc)); 1486 #else 1487 if (!allcolumns) { 1488 PetscCall(PetscCalloc1(C->cmap->N,&cmap)); 1489 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1490 } else { 1491 cmap = NULL; 1492 } 1493 #endif 1494 1495 /* Create lens for MatSeqAIJSetPreallocation() */ 1496 PetscCall(PetscCalloc1(nrow,&lens)); 1497 1498 /* Compute lens from local part of C */ 1499 for (j=0; j<nrow; j++) { 1500 row = irow[j]; 1501 proc = row2proc[j]; 1502 if (proc == rank) { 1503 /* diagonal part A = c->A */ 1504 ncols = ai[row-rstart+1] - ai[row-rstart]; 1505 cols = aj + ai[row-rstart]; 1506 if (!allcolumns) { 1507 for (k=0; k<ncols; k++) { 1508 #if defined(PETSC_USE_CTABLE) 1509 tcol = cmap_loc[cols[k]]; 1510 #else 1511 tcol = cmap[cols[k]+cstart]; 1512 #endif 1513 if (tcol) lens[j]++; 1514 } 1515 } else { /* allcolumns */ 1516 lens[j] = ncols; 1517 } 1518 1519 /* off-diagonal part B = c->B */ 1520 ncols = bi[row-rstart+1] - bi[row-rstart]; 1521 cols = bj + bi[row-rstart]; 1522 if (!allcolumns) { 1523 for (k=0; k<ncols; k++) { 1524 #if defined(PETSC_USE_CTABLE) 1525 PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol)); 1526 #else 1527 tcol = cmap[bmap[cols[k]]]; 1528 #endif 1529 if (tcol) lens[j]++; 1530 } 1531 } else { /* allcolumns */ 1532 lens[j] += ncols; 1533 } 1534 } 1535 } 1536 1537 /* Create row map (rmap): global row of C -> local row of submat */ 1538 #if defined(PETSC_USE_CTABLE) 1539 PetscCall(PetscTableCreate(nrow,C->rmap->N,&rmap)); 1540 for (j=0; j<nrow; j++) { 1541 row = irow[j]; 1542 proc = row2proc[j]; 1543 if (proc == rank) { /* a local row */ 1544 rmap_loc[row - rstart] = j; 1545 } else { 1546 PetscCall(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES)); 1547 } 1548 } 1549 #else 1550 PetscCall(PetscCalloc1(C->rmap->N,&rmap)); 1551 for (j=0; j<nrow; j++) { 1552 rmap[irow[j]] = j; 1553 } 1554 #endif 1555 1556 /* Update lens from offproc data */ 1557 /* recv a->j is done */ 1558 PetscCallMPI(MPI_Waitall(nrqs,r_waits3,r_status3)); 1559 for (i=0; i<nrqs; i++) { 1560 proc = pa[i]; 1561 sbuf1_i = sbuf1[proc]; 1562 /* jmax = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1563 ct1 = 2 + 1; 1564 ct2 = 0; 1565 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1566 rbuf3_i = rbuf3[i]; /* received C->j */ 1567 1568 /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1569 max1 = sbuf1_i[2]; 1570 for (k=0; k<max1; k++,ct1++) { 1571 #if defined(PETSC_USE_CTABLE) 1572 PetscCall(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row)); 1573 row--; 1574 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1575 #else 1576 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1577 #endif 1578 /* Now, store row index of submat in sbuf1_i[ct1] */ 1579 sbuf1_i[ct1] = row; 1580 1581 nnz = rbuf2_i[ct1]; 1582 if (!allcolumns) { 1583 for (l=0; l<nnz; l++,ct2++) { 1584 #if defined(PETSC_USE_CTABLE) 1585 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1586 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1587 } else { 1588 PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol)); 1589 } 1590 #else 1591 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1592 #endif 1593 if (tcol) lens[row]++; 1594 } 1595 } else { /* allcolumns */ 1596 lens[row] += nnz; 1597 } 1598 } 1599 } 1600 PetscCallMPI(MPI_Waitall(nrqr,s_waits3,s_status3)); 1601 PetscCall(PetscFree4(r_waits3,s_waits3,r_status3,s_status3)); 1602 1603 /* Create the submatrices */ 1604 PetscCall(MatCreate(PETSC_COMM_SELF,&submat)); 1605 PetscCall(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE)); 1606 1607 PetscCall(ISGetBlockSize(isrow[0],&i)); 1608 PetscCall(ISGetBlockSize(iscol[0],&j)); 1609 PetscCall(MatSetBlockSizes(submat,i,j)); 1610 PetscCall(MatSetType(submat,((PetscObject)A)->type_name)); 1611 PetscCall(MatSeqAIJSetPreallocation(submat,0,lens)); 1612 1613 /* create struct Mat_SubSppt and attached it to submat */ 1614 PetscCall(PetscNew(&smatis1)); 1615 subc = (Mat_SeqAIJ*)submat->data; 1616 subc->submatis1 = smatis1; 1617 1618 smatis1->id = 0; 1619 smatis1->nrqs = nrqs; 1620 smatis1->nrqr = nrqr; 1621 smatis1->rbuf1 = rbuf1; 1622 smatis1->rbuf2 = rbuf2; 1623 smatis1->rbuf3 = rbuf3; 1624 smatis1->sbuf2 = sbuf2; 1625 smatis1->req_source2 = req_source2; 1626 1627 smatis1->sbuf1 = sbuf1; 1628 smatis1->ptr = ptr; 1629 smatis1->tmp = tmp; 1630 smatis1->ctr = ctr; 1631 1632 smatis1->pa = pa; 1633 smatis1->req_size = req_size; 1634 smatis1->req_source1 = req_source1; 1635 1636 smatis1->allcolumns = allcolumns; 1637 smatis1->singleis = PETSC_TRUE; 1638 smatis1->row2proc = row2proc; 1639 smatis1->rmap = rmap; 1640 smatis1->cmap = cmap; 1641 #if defined(PETSC_USE_CTABLE) 1642 smatis1->rmap_loc = rmap_loc; 1643 smatis1->cmap_loc = cmap_loc; 1644 #endif 1645 1646 smatis1->destroy = submat->ops->destroy; 1647 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1648 submat->factortype = C->factortype; 1649 1650 /* compute rmax */ 1651 rmax = 0; 1652 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1653 1654 } else { /* scall == MAT_REUSE_MATRIX */ 1655 submat = submats[0]; 1656 PetscCheckFalse(submat->rmap->n != nrow || submat->cmap->n != ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1657 1658 subc = (Mat_SeqAIJ*)submat->data; 1659 rmax = subc->rmax; 1660 smatis1 = subc->submatis1; 1661 nrqs = smatis1->nrqs; 1662 nrqr = smatis1->nrqr; 1663 rbuf1 = smatis1->rbuf1; 1664 rbuf2 = smatis1->rbuf2; 1665 rbuf3 = smatis1->rbuf3; 1666 req_source2 = smatis1->req_source2; 1667 1668 sbuf1 = smatis1->sbuf1; 1669 sbuf2 = smatis1->sbuf2; 1670 ptr = smatis1->ptr; 1671 tmp = smatis1->tmp; 1672 ctr = smatis1->ctr; 1673 1674 pa = smatis1->pa; 1675 req_size = smatis1->req_size; 1676 req_source1 = smatis1->req_source1; 1677 1678 allcolumns = smatis1->allcolumns; 1679 row2proc = smatis1->row2proc; 1680 rmap = smatis1->rmap; 1681 cmap = smatis1->cmap; 1682 #if defined(PETSC_USE_CTABLE) 1683 rmap_loc = smatis1->rmap_loc; 1684 cmap_loc = smatis1->cmap_loc; 1685 #endif 1686 } 1687 1688 /* Post recv matrix values */ 1689 PetscCall(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals)); 1690 PetscCall(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4)); 1691 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4)); 1692 for (i=0; i<nrqs; ++i) { 1693 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i])); 1694 PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i)); 1695 } 1696 1697 /* Allocate sending buffers for a->a, and send them off */ 1698 PetscCall(PetscMalloc1(nrqr,&sbuf_aa)); 1699 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1700 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0])); 1701 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1702 1703 for (i=0; i<nrqr; i++) { 1704 rbuf1_i = rbuf1[i]; 1705 sbuf_aa_i = sbuf_aa[i]; 1706 ct1 = 2*rbuf1_i[0]+1; 1707 ct2 = 0; 1708 /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1709 1710 kmax = rbuf1_i[2]; 1711 for (k=0; k<kmax; k++,ct1++) { 1712 row = rbuf1_i[ct1] - rstart; 1713 nzA = ai[row+1] - ai[row]; 1714 nzB = bi[row+1] - bi[row]; 1715 ncols = nzA + nzB; 1716 cworkB = bj + bi[row]; 1717 vworkA = a_a + ai[row]; 1718 vworkB = b_a + bi[row]; 1719 1720 /* load the column values for this row into vals*/ 1721 vals = sbuf_aa_i + ct2; 1722 1723 lwrite = 0; 1724 for (l=0; l<nzB; l++) { 1725 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1726 } 1727 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1728 for (l=0; l<nzB; l++) { 1729 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1730 } 1731 1732 ct2 += ncols; 1733 } 1734 PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i)); 1735 } 1736 1737 /* Assemble submat */ 1738 /* First assemble the local rows */ 1739 for (j=0; j<nrow; j++) { 1740 row = irow[j]; 1741 proc = row2proc[j]; 1742 if (proc == rank) { 1743 Crow = row - rstart; /* local row index of C */ 1744 #if defined(PETSC_USE_CTABLE) 1745 row = rmap_loc[Crow]; /* row index of submat */ 1746 #else 1747 row = rmap[row]; 1748 #endif 1749 1750 if (allcolumns) { 1751 /* diagonal part A = c->A */ 1752 ncols = ai[Crow+1] - ai[Crow]; 1753 cols = aj + ai[Crow]; 1754 vals = a_a + ai[Crow]; 1755 i = 0; 1756 for (k=0; k<ncols; k++) { 1757 subcols[i] = cols[k] + cstart; 1758 subvals[i++] = vals[k]; 1759 } 1760 1761 /* off-diagonal part B = c->B */ 1762 ncols = bi[Crow+1] - bi[Crow]; 1763 cols = bj + bi[Crow]; 1764 vals = b_a + bi[Crow]; 1765 for (k=0; k<ncols; k++) { 1766 subcols[i] = bmap[cols[k]]; 1767 subvals[i++] = vals[k]; 1768 } 1769 1770 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1771 1772 } else { /* !allcolumns */ 1773 #if defined(PETSC_USE_CTABLE) 1774 /* diagonal part A = c->A */ 1775 ncols = ai[Crow+1] - ai[Crow]; 1776 cols = aj + ai[Crow]; 1777 vals = a_a + ai[Crow]; 1778 i = 0; 1779 for (k=0; k<ncols; k++) { 1780 tcol = cmap_loc[cols[k]]; 1781 if (tcol) { 1782 subcols[i] = --tcol; 1783 subvals[i++] = vals[k]; 1784 } 1785 } 1786 1787 /* off-diagonal part B = c->B */ 1788 ncols = bi[Crow+1] - bi[Crow]; 1789 cols = bj + bi[Crow]; 1790 vals = b_a + bi[Crow]; 1791 for (k=0; k<ncols; k++) { 1792 PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol)); 1793 if (tcol) { 1794 subcols[i] = --tcol; 1795 subvals[i++] = vals[k]; 1796 } 1797 } 1798 #else 1799 /* diagonal part A = c->A */ 1800 ncols = ai[Crow+1] - ai[Crow]; 1801 cols = aj + ai[Crow]; 1802 vals = a_a + ai[Crow]; 1803 i = 0; 1804 for (k=0; k<ncols; k++) { 1805 tcol = cmap[cols[k]+cstart]; 1806 if (tcol) { 1807 subcols[i] = --tcol; 1808 subvals[i++] = vals[k]; 1809 } 1810 } 1811 1812 /* off-diagonal part B = c->B */ 1813 ncols = bi[Crow+1] - bi[Crow]; 1814 cols = bj + bi[Crow]; 1815 vals = b_a + bi[Crow]; 1816 for (k=0; k<ncols; k++) { 1817 tcol = cmap[bmap[cols[k]]]; 1818 if (tcol) { 1819 subcols[i] = --tcol; 1820 subvals[i++] = vals[k]; 1821 } 1822 } 1823 #endif 1824 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1825 } 1826 } 1827 } 1828 1829 /* Now assemble the off-proc rows */ 1830 for (i=0; i<nrqs; i++) { /* for each requested message */ 1831 /* recv values from other processes */ 1832 PetscCallMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i)); 1833 proc = pa[idex]; 1834 sbuf1_i = sbuf1[proc]; 1835 /* jmax = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1836 ct1 = 2 + 1; 1837 ct2 = 0; /* count of received C->j */ 1838 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1839 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1840 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1841 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1842 1843 /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1844 max1 = sbuf1_i[2]; /* num of rows */ 1845 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1846 row = sbuf1_i[ct1]; /* row index of submat */ 1847 if (!allcolumns) { 1848 idex = 0; 1849 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1850 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1851 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1852 #if defined(PETSC_USE_CTABLE) 1853 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1854 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1855 } else { 1856 PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol)); 1857 } 1858 #else 1859 tcol = cmap[rbuf3_i[ct2]]; 1860 #endif 1861 if (tcol) { 1862 subcols[idex] = --tcol; /* may not be sorted */ 1863 subvals[idex++] = rbuf4_i[ct2]; 1864 1865 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1866 For reuse, we replace received C->j with index that should be inserted to submat */ 1867 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1868 } 1869 } 1870 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES)); 1871 } else { /* scall == MAT_REUSE_MATRIX */ 1872 submat = submats[0]; 1873 subc = (Mat_SeqAIJ*)submat->data; 1874 1875 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1876 for (l=0; l<nnz; l++) { 1877 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1878 subvals[idex++] = rbuf4_i[ct2]; 1879 } 1880 1881 bj = subc->j + subc->i[row]; /* sorted column indices */ 1882 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES)); 1883 } 1884 } else { /* allcolumns */ 1885 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1886 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES)); 1887 ct2 += nnz; 1888 } 1889 } 1890 } 1891 1892 /* sending a->a are done */ 1893 PetscCallMPI(MPI_Waitall(nrqr,s_waits4,s_status4)); 1894 PetscCall(PetscFree4(r_waits4,s_waits4,r_status4,s_status4)); 1895 1896 PetscCall(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY)); 1897 PetscCall(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY)); 1898 submats[0] = submat; 1899 1900 /* Restore the indices */ 1901 PetscCall(ISRestoreIndices(isrow[0],&irow)); 1902 if (!allcolumns) { 1903 PetscCall(ISRestoreIndices(iscol[0],&icol)); 1904 } 1905 1906 /* Destroy allocated memory */ 1907 for (i=0; i<nrqs; ++i) { 1908 PetscCall(PetscFree(rbuf4[i])); 1909 } 1910 PetscCall(PetscFree3(rbuf4,subcols,subvals)); 1911 if (sbuf_aa) { 1912 PetscCall(PetscFree(sbuf_aa[0])); 1913 PetscCall(PetscFree(sbuf_aa)); 1914 } 1915 1916 if (scall == MAT_INITIAL_MATRIX) { 1917 PetscCall(PetscFree(lens)); 1918 if (sbuf_aj) { 1919 PetscCall(PetscFree(sbuf_aj[0])); 1920 PetscCall(PetscFree(sbuf_aj)); 1921 } 1922 } 1923 PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a)); 1924 PetscCall(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a)); 1925 PetscFunctionReturn(0); 1926 } 1927 1928 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1929 { 1930 PetscInt ncol; 1931 PetscBool colflag,allcolumns=PETSC_FALSE; 1932 1933 PetscFunctionBegin; 1934 /* Allocate memory to hold all the submatrices */ 1935 if (scall == MAT_INITIAL_MATRIX) { 1936 PetscCall(PetscCalloc1(2,submat)); 1937 } 1938 1939 /* Check for special case: each processor gets entire matrix columns */ 1940 PetscCall(ISIdentity(iscol[0],&colflag)); 1941 PetscCall(ISGetLocalSize(iscol[0],&ncol)); 1942 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1943 1944 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat)); 1945 PetscFunctionReturn(0); 1946 } 1947 1948 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1949 { 1950 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1951 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1952 Mat_SeqAIJ *subc; 1953 Mat_SubSppt *smat; 1954 1955 PetscFunctionBegin; 1956 /* Check for special case: each processor has a single IS */ 1957 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1958 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat)); 1959 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1960 PetscFunctionReturn(0); 1961 } 1962 1963 /* Collect global wantallmatrix and nstages */ 1964 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1965 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1966 if (!nmax) nmax = 1; 1967 1968 if (scall == MAT_INITIAL_MATRIX) { 1969 /* Collect global wantallmatrix and nstages */ 1970 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1971 PetscCall(ISIdentity(*isrow,&rowflag)); 1972 PetscCall(ISIdentity(*iscol,&colflag)); 1973 PetscCall(ISGetLocalSize(*isrow,&nrow)); 1974 PetscCall(ISGetLocalSize(*iscol,&ncol)); 1975 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1976 wantallmatrix = PETSC_TRUE; 1977 1978 PetscCall(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL)); 1979 } 1980 } 1981 1982 /* Determine the number of stages through which submatrices are done 1983 Each stage will extract nmax submatrices. 1984 nmax is determined by the matrix column dimension. 1985 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1986 */ 1987 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 1988 1989 in[0] = -1*(PetscInt)wantallmatrix; 1990 in[1] = nstages; 1991 PetscCall(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C))); 1992 wantallmatrix = (PetscBool)(-out[0]); 1993 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 1994 1995 } else { /* MAT_REUSE_MATRIX */ 1996 if (ismax) { 1997 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 1998 smat = subc->submatis1; 1999 } else { /* (*submat)[0] is a dummy matrix */ 2000 smat = (Mat_SubSppt*)(*submat)[0]->data; 2001 } 2002 if (!smat) { 2003 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2004 wantallmatrix = PETSC_TRUE; 2005 } else if (smat->singleis) { 2006 PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat)); 2007 PetscFunctionReturn(0); 2008 } else { 2009 nstages = smat->nstages; 2010 } 2011 } 2012 2013 if (wantallmatrix) { 2014 PetscCall(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat)); 2015 PetscFunctionReturn(0); 2016 } 2017 2018 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2019 if (scall == MAT_INITIAL_MATRIX) { 2020 PetscCall(PetscCalloc1(ismax+nstages,submat)); 2021 } 2022 2023 for (i=0,pos=0; i<nstages; i++) { 2024 if (pos+nmax <= ismax) max_no = nmax; 2025 else if (pos >= ismax) max_no = 0; 2026 else max_no = ismax-pos; 2027 2028 PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos)); 2029 if (!max_no) { 2030 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2031 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2032 smat->nstages = nstages; 2033 } 2034 pos++; /* advance to next dummy matrix if any */ 2035 } else pos += max_no; 2036 } 2037 2038 if (ismax && scall == MAT_INITIAL_MATRIX) { 2039 /* save nstages for reuse */ 2040 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2041 smat = subc->submatis1; 2042 smat->nstages = nstages; 2043 } 2044 PetscFunctionReturn(0); 2045 } 2046 2047 /* -------------------------------------------------------------------------*/ 2048 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2049 { 2050 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2051 Mat A = c->A; 2052 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2053 const PetscInt **icol,**irow; 2054 PetscInt *nrow,*ncol,start; 2055 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2056 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2057 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2058 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2059 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2060 #if defined(PETSC_USE_CTABLE) 2061 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2062 #else 2063 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2064 #endif 2065 const PetscInt *irow_i; 2066 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2067 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2068 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2069 MPI_Comm comm; 2070 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2071 PetscMPIInt *onodes1,*olengths1,end; 2072 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2073 Mat_SubSppt *smat_i; 2074 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2075 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2076 2077 PetscFunctionBegin; 2078 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 2079 size = c->size; 2080 rank = c->rank; 2081 2082 PetscCall(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns)); 2083 PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted)); 2084 2085 for (i=0; i<ismax; i++) { 2086 PetscCall(ISSorted(iscol[i],&issorted[i])); 2087 if (!issorted[i]) iscsorted = issorted[i]; 2088 2089 PetscCall(ISSorted(isrow[i],&issorted[i])); 2090 2091 PetscCall(ISGetIndices(isrow[i],&irow[i])); 2092 PetscCall(ISGetLocalSize(isrow[i],&nrow[i])); 2093 2094 /* Check for special case: allcolumn */ 2095 PetscCall(ISIdentity(iscol[i],&colflag)); 2096 PetscCall(ISGetLocalSize(iscol[i],&ncol[i])); 2097 if (colflag && ncol[i] == C->cmap->N) { 2098 allcolumns[i] = PETSC_TRUE; 2099 icol[i] = NULL; 2100 } else { 2101 allcolumns[i] = PETSC_FALSE; 2102 PetscCall(ISGetIndices(iscol[i],&icol[i])); 2103 } 2104 } 2105 2106 if (scall == MAT_REUSE_MATRIX) { 2107 /* Assumes new rows are same length as the old rows */ 2108 for (i=0; i<ismax; i++) { 2109 PetscCheckFalse(!submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i); 2110 subc = (Mat_SeqAIJ*)submats[i]->data; 2111 PetscCheckFalse((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2112 2113 /* Initial matrix as if empty */ 2114 PetscCall(PetscArrayzero(subc->ilen,submats[i]->rmap->n)); 2115 2116 smat_i = subc->submatis1; 2117 2118 nrqs = smat_i->nrqs; 2119 nrqr = smat_i->nrqr; 2120 rbuf1 = smat_i->rbuf1; 2121 rbuf2 = smat_i->rbuf2; 2122 rbuf3 = smat_i->rbuf3; 2123 req_source2 = smat_i->req_source2; 2124 2125 sbuf1 = smat_i->sbuf1; 2126 sbuf2 = smat_i->sbuf2; 2127 ptr = smat_i->ptr; 2128 tmp = smat_i->tmp; 2129 ctr = smat_i->ctr; 2130 2131 pa = smat_i->pa; 2132 req_size = smat_i->req_size; 2133 req_source1 = smat_i->req_source1; 2134 2135 allcolumns[i] = smat_i->allcolumns; 2136 row2proc[i] = smat_i->row2proc; 2137 rmap[i] = smat_i->rmap; 2138 cmap[i] = smat_i->cmap; 2139 } 2140 2141 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2142 PetscCheckFalse(!submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2143 smat_i = (Mat_SubSppt*)submats[0]->data; 2144 2145 nrqs = smat_i->nrqs; 2146 nrqr = smat_i->nrqr; 2147 rbuf1 = smat_i->rbuf1; 2148 rbuf2 = smat_i->rbuf2; 2149 rbuf3 = smat_i->rbuf3; 2150 req_source2 = smat_i->req_source2; 2151 2152 sbuf1 = smat_i->sbuf1; 2153 sbuf2 = smat_i->sbuf2; 2154 ptr = smat_i->ptr; 2155 tmp = smat_i->tmp; 2156 ctr = smat_i->ctr; 2157 2158 pa = smat_i->pa; 2159 req_size = smat_i->req_size; 2160 req_source1 = smat_i->req_source1; 2161 2162 allcolumns[0] = PETSC_FALSE; 2163 } 2164 } else { /* scall == MAT_INITIAL_MATRIX */ 2165 /* Get some new tags to keep the communication clean */ 2166 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2)); 2167 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3)); 2168 2169 /* evaluate communication - mesg to who, length of mesg, and buffer space 2170 required. Based on this, buffers are allocated, and data copied into them*/ 2171 PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4)); /* mesg size, initialize work vectors */ 2172 2173 for (i=0; i<ismax; i++) { 2174 jmax = nrow[i]; 2175 irow_i = irow[i]; 2176 2177 PetscCall(PetscMalloc1(jmax,&row2proc_i)); 2178 row2proc[i] = row2proc_i; 2179 2180 if (issorted[i]) proc = 0; 2181 for (j=0; j<jmax; j++) { 2182 if (!issorted[i]) proc = 0; 2183 row = irow_i[j]; 2184 while (row >= C->rmap->range[proc+1]) proc++; 2185 w4[proc]++; 2186 row2proc_i[j] = proc; /* map row index to proc */ 2187 } 2188 for (j=0; j<size; j++) { 2189 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2190 } 2191 } 2192 2193 nrqs = 0; /* no of outgoing messages */ 2194 msz = 0; /* total mesg length (for all procs) */ 2195 w1[rank] = 0; /* no mesg sent to self */ 2196 w3[rank] = 0; 2197 for (i=0; i<size; i++) { 2198 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2199 } 2200 PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/ 2201 for (i=0,j=0; i<size; i++) { 2202 if (w1[i]) { pa[j] = i; j++; } 2203 } 2204 2205 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2206 for (i=0; i<nrqs; i++) { 2207 j = pa[i]; 2208 w1[j] += w2[j] + 2* w3[j]; 2209 msz += w1[j]; 2210 } 2211 PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz)); 2212 2213 /* Determine the number of messages to expect, their lengths, from from-ids */ 2214 PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 2215 PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 2216 2217 /* Now post the Irecvs corresponding to these messages */ 2218 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0)); 2219 PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1)); 2220 2221 /* Allocate Memory for outgoing messages */ 2222 PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr)); 2223 PetscCall(PetscArrayzero(sbuf1,size)); 2224 PetscCall(PetscArrayzero(ptr,size)); 2225 2226 { 2227 PetscInt *iptr = tmp; 2228 k = 0; 2229 for (i=0; i<nrqs; i++) { 2230 j = pa[i]; 2231 iptr += k; 2232 sbuf1[j] = iptr; 2233 k = w1[j]; 2234 } 2235 } 2236 2237 /* Form the outgoing messages. Initialize the header space */ 2238 for (i=0; i<nrqs; i++) { 2239 j = pa[i]; 2240 sbuf1[j][0] = 0; 2241 PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j])); 2242 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2243 } 2244 2245 /* Parse the isrow and copy data into outbuf */ 2246 for (i=0; i<ismax; i++) { 2247 row2proc_i = row2proc[i]; 2248 PetscCall(PetscArrayzero(ctr,size)); 2249 irow_i = irow[i]; 2250 jmax = nrow[i]; 2251 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2252 proc = row2proc_i[j]; 2253 if (proc != rank) { /* copy to the outgoing buf*/ 2254 ctr[proc]++; 2255 *ptr[proc] = irow_i[j]; 2256 ptr[proc]++; 2257 } 2258 } 2259 /* Update the headers for the current IS */ 2260 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2261 if ((ctr_j = ctr[j])) { 2262 sbuf1_j = sbuf1[j]; 2263 k = ++sbuf1_j[0]; 2264 sbuf1_j[2*k] = ctr_j; 2265 sbuf1_j[2*k-1] = i; 2266 } 2267 } 2268 } 2269 2270 /* Now post the sends */ 2271 PetscCall(PetscMalloc1(nrqs,&s_waits1)); 2272 for (i=0; i<nrqs; ++i) { 2273 j = pa[i]; 2274 PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i)); 2275 } 2276 2277 /* Post Receives to capture the buffer size */ 2278 PetscCall(PetscMalloc1(nrqs,&r_waits2)); 2279 PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3)); 2280 if (nrqs) rbuf2[0] = tmp + msz; 2281 for (i=1; i<nrqs; ++i) { 2282 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2283 } 2284 for (i=0; i<nrqs; ++i) { 2285 j = pa[i]; 2286 PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i)); 2287 } 2288 2289 /* Send to other procs the buf size they should allocate */ 2290 /* Receive messages*/ 2291 PetscCall(PetscMalloc1(nrqr,&s_waits2)); 2292 PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1)); 2293 { 2294 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2295 PetscInt *sbuf2_i; 2296 2297 PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE)); 2298 for (i=0; i<nrqr; ++i) { 2299 req_size[i] = 0; 2300 rbuf1_i = rbuf1[i]; 2301 start = 2*rbuf1_i[0] + 1; 2302 end = olengths1[i]; 2303 PetscCall(PetscMalloc1(end,&sbuf2[i])); 2304 sbuf2_i = sbuf2[i]; 2305 for (j=start; j<end; j++) { 2306 id = rbuf1_i[j] - rstart; 2307 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2308 sbuf2_i[j] = ncols; 2309 req_size[i] += ncols; 2310 } 2311 req_source1[i] = onodes1[i]; 2312 /* form the header */ 2313 sbuf2_i[0] = req_size[i]; 2314 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2315 2316 PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i)); 2317 } 2318 } 2319 2320 PetscCall(PetscFree(onodes1)); 2321 PetscCall(PetscFree(olengths1)); 2322 PetscCall(PetscFree(r_waits1)); 2323 PetscCall(PetscFree4(w1,w2,w3,w4)); 2324 2325 /* Receive messages*/ 2326 PetscCall(PetscMalloc1(nrqs,&r_waits3)); 2327 PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE)); 2328 for (i=0; i<nrqs; ++i) { 2329 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); 2330 req_source2[i] = pa[i]; 2331 PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i)); 2332 } 2333 PetscCall(PetscFree(r_waits2)); 2334 2335 /* Wait on sends1 and sends2 */ 2336 PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE)); 2337 PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE)); 2338 PetscCall(PetscFree(s_waits1)); 2339 PetscCall(PetscFree(s_waits2)); 2340 2341 /* Now allocate sending buffers for a->j, and send them off */ 2342 PetscCall(PetscMalloc1(nrqr,&sbuf_aj)); 2343 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2344 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0])); 2345 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2346 2347 PetscCall(PetscMalloc1(nrqr,&s_waits3)); 2348 { 2349 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2350 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2351 PetscInt cend = C->cmap->rend; 2352 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2353 2354 for (i=0; i<nrqr; i++) { 2355 rbuf1_i = rbuf1[i]; 2356 sbuf_aj_i = sbuf_aj[i]; 2357 ct1 = 2*rbuf1_i[0] + 1; 2358 ct2 = 0; 2359 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2360 kmax = rbuf1[i][2*j]; 2361 for (k=0; k<kmax; k++,ct1++) { 2362 row = rbuf1_i[ct1] - rstart; 2363 nzA = a_i[row+1] - a_i[row]; 2364 nzB = b_i[row+1] - b_i[row]; 2365 ncols = nzA + nzB; 2366 cworkA = a_j + a_i[row]; 2367 cworkB = b_j + b_i[row]; 2368 2369 /* load the column indices for this row into cols */ 2370 cols = sbuf_aj_i + ct2; 2371 2372 lwrite = 0; 2373 for (l=0; l<nzB; l++) { 2374 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2375 } 2376 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2377 for (l=0; l<nzB; l++) { 2378 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2379 } 2380 2381 ct2 += ncols; 2382 } 2383 } 2384 PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i)); 2385 } 2386 } 2387 2388 /* create col map: global col of C -> local col of submatrices */ 2389 { 2390 const PetscInt *icol_i; 2391 #if defined(PETSC_USE_CTABLE) 2392 for (i=0; i<ismax; i++) { 2393 if (!allcolumns[i]) { 2394 PetscCall(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i])); 2395 2396 jmax = ncol[i]; 2397 icol_i = icol[i]; 2398 cmap_i = cmap[i]; 2399 for (j=0; j<jmax; j++) { 2400 PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES)); 2401 } 2402 } else cmap[i] = NULL; 2403 } 2404 #else 2405 for (i=0; i<ismax; i++) { 2406 if (!allcolumns[i]) { 2407 PetscCall(PetscCalloc1(C->cmap->N,&cmap[i])); 2408 jmax = ncol[i]; 2409 icol_i = icol[i]; 2410 cmap_i = cmap[i]; 2411 for (j=0; j<jmax; j++) { 2412 cmap_i[icol_i[j]] = j+1; 2413 } 2414 } else cmap[i] = NULL; 2415 } 2416 #endif 2417 } 2418 2419 /* Create lens which is required for MatCreate... */ 2420 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2421 PetscCall(PetscMalloc1(ismax,&lens)); 2422 2423 if (ismax) { 2424 PetscCall(PetscCalloc1(j,&lens[0])); 2425 } 2426 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2427 2428 /* Update lens from local data */ 2429 for (i=0; i<ismax; i++) { 2430 row2proc_i = row2proc[i]; 2431 jmax = nrow[i]; 2432 if (!allcolumns[i]) cmap_i = cmap[i]; 2433 irow_i = irow[i]; 2434 lens_i = lens[i]; 2435 for (j=0; j<jmax; j++) { 2436 row = irow_i[j]; 2437 proc = row2proc_i[j]; 2438 if (proc == rank) { 2439 PetscCall(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL)); 2440 if (!allcolumns[i]) { 2441 for (k=0; k<ncols; k++) { 2442 #if defined(PETSC_USE_CTABLE) 2443 PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol)); 2444 #else 2445 tcol = cmap_i[cols[k]]; 2446 #endif 2447 if (tcol) lens_i[j]++; 2448 } 2449 } else { /* allcolumns */ 2450 lens_i[j] = ncols; 2451 } 2452 PetscCall(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL)); 2453 } 2454 } 2455 } 2456 2457 /* Create row map: global row of C -> local row of submatrices */ 2458 #if defined(PETSC_USE_CTABLE) 2459 for (i=0; i<ismax; i++) { 2460 PetscCall(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i])); 2461 irow_i = irow[i]; 2462 jmax = nrow[i]; 2463 for (j=0; j<jmax; j++) { 2464 PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES)); 2465 } 2466 } 2467 #else 2468 for (i=0; i<ismax; i++) { 2469 PetscCall(PetscCalloc1(C->rmap->N,&rmap[i])); 2470 rmap_i = rmap[i]; 2471 irow_i = irow[i]; 2472 jmax = nrow[i]; 2473 for (j=0; j<jmax; j++) { 2474 rmap_i[irow_i[j]] = j; 2475 } 2476 } 2477 #endif 2478 2479 /* Update lens from offproc data */ 2480 { 2481 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2482 2483 PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE)); 2484 for (tmp2=0; tmp2<nrqs; tmp2++) { 2485 sbuf1_i = sbuf1[pa[tmp2]]; 2486 jmax = sbuf1_i[0]; 2487 ct1 = 2*jmax+1; 2488 ct2 = 0; 2489 rbuf2_i = rbuf2[tmp2]; 2490 rbuf3_i = rbuf3[tmp2]; 2491 for (j=1; j<=jmax; j++) { 2492 is_no = sbuf1_i[2*j-1]; 2493 max1 = sbuf1_i[2*j]; 2494 lens_i = lens[is_no]; 2495 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2496 rmap_i = rmap[is_no]; 2497 for (k=0; k<max1; k++,ct1++) { 2498 #if defined(PETSC_USE_CTABLE) 2499 PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row)); 2500 row--; 2501 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2502 #else 2503 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2504 #endif 2505 max2 = rbuf2_i[ct1]; 2506 for (l=0; l<max2; l++,ct2++) { 2507 if (!allcolumns[is_no]) { 2508 #if defined(PETSC_USE_CTABLE) 2509 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol)); 2510 #else 2511 tcol = cmap_i[rbuf3_i[ct2]]; 2512 #endif 2513 if (tcol) lens_i[row]++; 2514 } else { /* allcolumns */ 2515 lens_i[row]++; /* lens_i[row] += max2 ? */ 2516 } 2517 } 2518 } 2519 } 2520 } 2521 } 2522 PetscCall(PetscFree(r_waits3)); 2523 PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE)); 2524 PetscCall(PetscFree(s_waits3)); 2525 2526 /* Create the submatrices */ 2527 for (i=0; i<ismax; i++) { 2528 PetscInt rbs,cbs; 2529 2530 PetscCall(ISGetBlockSize(isrow[i],&rbs)); 2531 PetscCall(ISGetBlockSize(iscol[i],&cbs)); 2532 2533 PetscCall(MatCreate(PETSC_COMM_SELF,submats+i)); 2534 PetscCall(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE)); 2535 2536 PetscCall(MatSetBlockSizes(submats[i],rbs,cbs)); 2537 PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name)); 2538 PetscCall(MatSeqAIJSetPreallocation(submats[i],0,lens[i])); 2539 2540 /* create struct Mat_SubSppt and attached it to submat */ 2541 PetscCall(PetscNew(&smat_i)); 2542 subc = (Mat_SeqAIJ*)submats[i]->data; 2543 subc->submatis1 = smat_i; 2544 2545 smat_i->destroy = submats[i]->ops->destroy; 2546 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2547 submats[i]->factortype = C->factortype; 2548 2549 smat_i->id = i; 2550 smat_i->nrqs = nrqs; 2551 smat_i->nrqr = nrqr; 2552 smat_i->rbuf1 = rbuf1; 2553 smat_i->rbuf2 = rbuf2; 2554 smat_i->rbuf3 = rbuf3; 2555 smat_i->sbuf2 = sbuf2; 2556 smat_i->req_source2 = req_source2; 2557 2558 smat_i->sbuf1 = sbuf1; 2559 smat_i->ptr = ptr; 2560 smat_i->tmp = tmp; 2561 smat_i->ctr = ctr; 2562 2563 smat_i->pa = pa; 2564 smat_i->req_size = req_size; 2565 smat_i->req_source1 = req_source1; 2566 2567 smat_i->allcolumns = allcolumns[i]; 2568 smat_i->singleis = PETSC_FALSE; 2569 smat_i->row2proc = row2proc[i]; 2570 smat_i->rmap = rmap[i]; 2571 smat_i->cmap = cmap[i]; 2572 } 2573 2574 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2575 PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0])); 2576 PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE)); 2577 PetscCall(MatSetType(submats[0],MATDUMMY)); 2578 2579 /* create struct Mat_SubSppt and attached it to submat */ 2580 PetscCall(PetscNewLog(submats[0],&smat_i)); 2581 submats[0]->data = (void*)smat_i; 2582 2583 smat_i->destroy = submats[0]->ops->destroy; 2584 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2585 submats[0]->factortype = C->factortype; 2586 2587 smat_i->id = 0; 2588 smat_i->nrqs = nrqs; 2589 smat_i->nrqr = nrqr; 2590 smat_i->rbuf1 = rbuf1; 2591 smat_i->rbuf2 = rbuf2; 2592 smat_i->rbuf3 = rbuf3; 2593 smat_i->sbuf2 = sbuf2; 2594 smat_i->req_source2 = req_source2; 2595 2596 smat_i->sbuf1 = sbuf1; 2597 smat_i->ptr = ptr; 2598 smat_i->tmp = tmp; 2599 smat_i->ctr = ctr; 2600 2601 smat_i->pa = pa; 2602 smat_i->req_size = req_size; 2603 smat_i->req_source1 = req_source1; 2604 2605 smat_i->allcolumns = PETSC_FALSE; 2606 smat_i->singleis = PETSC_FALSE; 2607 smat_i->row2proc = NULL; 2608 smat_i->rmap = NULL; 2609 smat_i->cmap = NULL; 2610 } 2611 2612 if (ismax) PetscCall(PetscFree(lens[0])); 2613 PetscCall(PetscFree(lens)); 2614 if (sbuf_aj) { 2615 PetscCall(PetscFree(sbuf_aj[0])); 2616 PetscCall(PetscFree(sbuf_aj)); 2617 } 2618 2619 } /* endof scall == MAT_INITIAL_MATRIX */ 2620 2621 /* Post recv matrix values */ 2622 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4)); 2623 PetscCall(PetscMalloc1(nrqs,&rbuf4)); 2624 PetscCall(PetscMalloc1(nrqs,&r_waits4)); 2625 for (i=0; i<nrqs; ++i) { 2626 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i])); 2627 PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i)); 2628 } 2629 2630 /* Allocate sending buffers for a->a, and send them off */ 2631 PetscCall(PetscMalloc1(nrqr,&sbuf_aa)); 2632 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2633 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0])); 2634 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2635 2636 PetscCall(PetscMalloc1(nrqr,&s_waits4)); 2637 { 2638 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2639 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2640 PetscInt cend = C->cmap->rend; 2641 PetscInt *b_j = b->j; 2642 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2643 2644 PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a)); 2645 PetscCall(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a)); 2646 for (i=0; i<nrqr; i++) { 2647 rbuf1_i = rbuf1[i]; 2648 sbuf_aa_i = sbuf_aa[i]; 2649 ct1 = 2*rbuf1_i[0]+1; 2650 ct2 = 0; 2651 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2652 kmax = rbuf1_i[2*j]; 2653 for (k=0; k<kmax; k++,ct1++) { 2654 row = rbuf1_i[ct1] - rstart; 2655 nzA = a_i[row+1] - a_i[row]; 2656 nzB = b_i[row+1] - b_i[row]; 2657 ncols = nzA + nzB; 2658 cworkB = b_j + b_i[row]; 2659 vworkA = a_a + a_i[row]; 2660 vworkB = b_a + b_i[row]; 2661 2662 /* load the column values for this row into vals*/ 2663 vals = sbuf_aa_i+ct2; 2664 2665 lwrite = 0; 2666 for (l=0; l<nzB; l++) { 2667 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2668 } 2669 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2670 for (l=0; l<nzB; l++) { 2671 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2672 } 2673 2674 ct2 += ncols; 2675 } 2676 } 2677 PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i)); 2678 } 2679 PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a)); 2680 PetscCall(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a)); 2681 } 2682 2683 /* Assemble the matrices */ 2684 /* First assemble the local rows */ 2685 for (i=0; i<ismax; i++) { 2686 row2proc_i = row2proc[i]; 2687 subc = (Mat_SeqAIJ*)submats[i]->data; 2688 imat_ilen = subc->ilen; 2689 imat_j = subc->j; 2690 imat_i = subc->i; 2691 imat_a = subc->a; 2692 2693 if (!allcolumns[i]) cmap_i = cmap[i]; 2694 rmap_i = rmap[i]; 2695 irow_i = irow[i]; 2696 jmax = nrow[i]; 2697 for (j=0; j<jmax; j++) { 2698 row = irow_i[j]; 2699 proc = row2proc_i[j]; 2700 if (proc == rank) { 2701 old_row = row; 2702 #if defined(PETSC_USE_CTABLE) 2703 PetscCall(PetscTableFind(rmap_i,row+1,&row)); 2704 row--; 2705 #else 2706 row = rmap_i[row]; 2707 #endif 2708 ilen_row = imat_ilen[row]; 2709 PetscCall(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals)); 2710 mat_i = imat_i[row]; 2711 mat_a = imat_a + mat_i; 2712 mat_j = imat_j + mat_i; 2713 if (!allcolumns[i]) { 2714 for (k=0; k<ncols; k++) { 2715 #if defined(PETSC_USE_CTABLE) 2716 PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol)); 2717 #else 2718 tcol = cmap_i[cols[k]]; 2719 #endif 2720 if (tcol) { 2721 *mat_j++ = tcol - 1; 2722 *mat_a++ = vals[k]; 2723 ilen_row++; 2724 } 2725 } 2726 } else { /* allcolumns */ 2727 for (k=0; k<ncols; k++) { 2728 *mat_j++ = cols[k]; /* global col index! */ 2729 *mat_a++ = vals[k]; 2730 ilen_row++; 2731 } 2732 } 2733 PetscCall(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals)); 2734 2735 imat_ilen[row] = ilen_row; 2736 } 2737 } 2738 } 2739 2740 /* Now assemble the off proc rows */ 2741 PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE)); 2742 for (tmp2=0; tmp2<nrqs; tmp2++) { 2743 sbuf1_i = sbuf1[pa[tmp2]]; 2744 jmax = sbuf1_i[0]; 2745 ct1 = 2*jmax + 1; 2746 ct2 = 0; 2747 rbuf2_i = rbuf2[tmp2]; 2748 rbuf3_i = rbuf3[tmp2]; 2749 rbuf4_i = rbuf4[tmp2]; 2750 for (j=1; j<=jmax; j++) { 2751 is_no = sbuf1_i[2*j-1]; 2752 rmap_i = rmap[is_no]; 2753 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2754 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2755 imat_ilen = subc->ilen; 2756 imat_j = subc->j; 2757 imat_i = subc->i; 2758 imat_a = subc->a; 2759 max1 = sbuf1_i[2*j]; 2760 for (k=0; k<max1; k++,ct1++) { 2761 row = sbuf1_i[ct1]; 2762 #if defined(PETSC_USE_CTABLE) 2763 PetscCall(PetscTableFind(rmap_i,row+1,&row)); 2764 row--; 2765 #else 2766 row = rmap_i[row]; 2767 #endif 2768 ilen = imat_ilen[row]; 2769 mat_i = imat_i[row]; 2770 mat_a = imat_a + mat_i; 2771 mat_j = imat_j + mat_i; 2772 max2 = rbuf2_i[ct1]; 2773 if (!allcolumns[is_no]) { 2774 for (l=0; l<max2; l++,ct2++) { 2775 #if defined(PETSC_USE_CTABLE) 2776 PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol)); 2777 #else 2778 tcol = cmap_i[rbuf3_i[ct2]]; 2779 #endif 2780 if (tcol) { 2781 *mat_j++ = tcol - 1; 2782 *mat_a++ = rbuf4_i[ct2]; 2783 ilen++; 2784 } 2785 } 2786 } else { /* allcolumns */ 2787 for (l=0; l<max2; l++,ct2++) { 2788 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2789 *mat_a++ = rbuf4_i[ct2]; 2790 ilen++; 2791 } 2792 } 2793 imat_ilen[row] = ilen; 2794 } 2795 } 2796 } 2797 2798 if (!iscsorted) { /* sort column indices of the rows */ 2799 for (i=0; i<ismax; i++) { 2800 subc = (Mat_SeqAIJ*)submats[i]->data; 2801 imat_j = subc->j; 2802 imat_i = subc->i; 2803 imat_a = subc->a; 2804 imat_ilen = subc->ilen; 2805 2806 if (allcolumns[i]) continue; 2807 jmax = nrow[i]; 2808 for (j=0; j<jmax; j++) { 2809 mat_i = imat_i[j]; 2810 mat_a = imat_a + mat_i; 2811 mat_j = imat_j + mat_i; 2812 PetscCall(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a)); 2813 } 2814 } 2815 } 2816 2817 PetscCall(PetscFree(r_waits4)); 2818 PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE)); 2819 PetscCall(PetscFree(s_waits4)); 2820 2821 /* Restore the indices */ 2822 for (i=0; i<ismax; i++) { 2823 PetscCall(ISRestoreIndices(isrow[i],irow+i)); 2824 if (!allcolumns[i]) { 2825 PetscCall(ISRestoreIndices(iscol[i],icol+i)); 2826 } 2827 } 2828 2829 for (i=0; i<ismax; i++) { 2830 PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY)); 2831 PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY)); 2832 } 2833 2834 /* Destroy allocated memory */ 2835 if (sbuf_aa) { 2836 PetscCall(PetscFree(sbuf_aa[0])); 2837 PetscCall(PetscFree(sbuf_aa)); 2838 } 2839 PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted)); 2840 2841 for (i=0; i<nrqs; ++i) { 2842 PetscCall(PetscFree(rbuf4[i])); 2843 } 2844 PetscCall(PetscFree(rbuf4)); 2845 2846 PetscCall(PetscFree4(row2proc,cmap,rmap,allcolumns)); 2847 PetscFunctionReturn(0); 2848 } 2849 2850 /* 2851 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2852 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2853 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2854 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2855 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2856 state, and needs to be "assembled" later by compressing B's column space. 2857 2858 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2859 Following this call, C->A & C->B have been created, even if empty. 2860 */ 2861 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2862 { 2863 /* If making this function public, change the error returned in this function away from _PLIB. */ 2864 Mat_MPIAIJ *aij; 2865 Mat_SeqAIJ *Baij; 2866 PetscBool seqaij,Bdisassembled; 2867 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2868 PetscScalar v; 2869 const PetscInt *rowindices,*colindices; 2870 2871 PetscFunctionBegin; 2872 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2873 if (A) { 2874 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij)); 2875 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2876 if (rowemb) { 2877 PetscCall(ISGetLocalSize(rowemb,&m)); 2878 PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2879 } else { 2880 if (C->rmap->n != A->rmap->n) { 2881 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2882 } 2883 } 2884 if (dcolemb) { 2885 PetscCall(ISGetLocalSize(dcolemb,&n)); 2886 PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n); 2887 } else { 2888 PetscCheckFalse(C->cmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2889 } 2890 } 2891 if (B) { 2892 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij)); 2893 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2894 if (rowemb) { 2895 PetscCall(ISGetLocalSize(rowemb,&m)); 2896 PetscCheckFalse(m != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2897 } else { 2898 if (C->rmap->n != B->rmap->n) { 2899 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2900 } 2901 } 2902 if (ocolemb) { 2903 PetscCall(ISGetLocalSize(ocolemb,&n)); 2904 PetscCheckFalse(n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n); 2905 } else { 2906 PetscCheckFalse(C->cmap->N - C->cmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2907 } 2908 } 2909 2910 aij = (Mat_MPIAIJ*)C->data; 2911 if (!aij->A) { 2912 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2913 PetscCall(MatCreate(PETSC_COMM_SELF,&aij->A)); 2914 PetscCall(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n)); 2915 PetscCall(MatSetBlockSizesFromMats(aij->A,C,C)); 2916 PetscCall(MatSetType(aij->A,MATSEQAIJ)); 2917 PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A)); 2918 } 2919 if (A) { 2920 PetscCall(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A)); 2921 } else { 2922 PetscCall(MatSetUp(aij->A)); 2923 } 2924 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2925 /* 2926 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2927 need to "disassemble" B -- convert it to using C's global indices. 2928 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2929 2930 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2931 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2932 2933 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2934 At least avoid calling MatSetValues() and the implied searches? 2935 */ 2936 2937 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2938 #if defined(PETSC_USE_CTABLE) 2939 PetscCall(PetscTableDestroy(&aij->colmap)); 2940 #else 2941 PetscCall(PetscFree(aij->colmap)); 2942 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2943 if (aij->B) { 2944 PetscCall(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt))); 2945 } 2946 #endif 2947 ngcol = 0; 2948 if (aij->lvec) { 2949 PetscCall(VecGetSize(aij->lvec,&ngcol)); 2950 } 2951 if (aij->garray) { 2952 PetscCall(PetscFree(aij->garray)); 2953 PetscCall(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt))); 2954 } 2955 PetscCall(VecDestroy(&aij->lvec)); 2956 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2957 } 2958 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2959 PetscCall(MatDestroy(&aij->B)); 2960 } 2961 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2962 PetscCall(MatZeroEntries(aij->B)); 2963 } 2964 } 2965 Bdisassembled = PETSC_FALSE; 2966 if (!aij->B) { 2967 PetscCall(MatCreate(PETSC_COMM_SELF,&aij->B)); 2968 PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B)); 2969 PetscCall(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N)); 2970 PetscCall(MatSetBlockSizesFromMats(aij->B,B,B)); 2971 PetscCall(MatSetType(aij->B,MATSEQAIJ)); 2972 Bdisassembled = PETSC_TRUE; 2973 } 2974 if (B) { 2975 Baij = (Mat_SeqAIJ*)B->data; 2976 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2977 PetscCall(PetscMalloc1(B->rmap->n,&nz)); 2978 for (i=0; i<B->rmap->n; i++) { 2979 nz[i] = Baij->i[i+1] - Baij->i[i]; 2980 } 2981 PetscCall(MatSeqAIJSetPreallocation(aij->B,0,nz)); 2982 PetscCall(PetscFree(nz)); 2983 } 2984 2985 PetscCall(PetscLayoutGetRange(C->rmap,&rstart,&rend)); 2986 shift = rend-rstart; 2987 count = 0; 2988 rowindices = NULL; 2989 colindices = NULL; 2990 if (rowemb) { 2991 PetscCall(ISGetIndices(rowemb,&rowindices)); 2992 } 2993 if (ocolemb) { 2994 PetscCall(ISGetIndices(ocolemb,&colindices)); 2995 } 2996 for (i=0; i<B->rmap->n; i++) { 2997 PetscInt row; 2998 row = i; 2999 if (rowindices) row = rowindices[i]; 3000 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3001 col = Baij->j[count]; 3002 if (colindices) col = colindices[col]; 3003 if (Bdisassembled && col>=rstart) col += shift; 3004 v = Baij->a[count]; 3005 PetscCall(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES)); 3006 ++count; 3007 } 3008 } 3009 /* No assembly for aij->B is necessary. */ 3010 /* FIXME: set aij->B's nonzerostate correctly. */ 3011 } else { 3012 PetscCall(MatSetUp(aij->B)); 3013 } 3014 C->preallocated = PETSC_TRUE; 3015 C->was_assembled = PETSC_FALSE; 3016 C->assembled = PETSC_FALSE; 3017 /* 3018 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3019 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3020 */ 3021 PetscFunctionReturn(0); 3022 } 3023 3024 /* 3025 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3026 */ 3027 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3028 { 3029 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3030 3031 PetscFunctionBegin; 3032 PetscValidPointer(A,2); 3033 PetscValidPointer(B,3); 3034 /* FIXME: make sure C is assembled */ 3035 *A = aij->A; 3036 *B = aij->B; 3037 /* Note that we don't incref *A and *B, so be careful! */ 3038 PetscFunctionReturn(0); 3039 } 3040 3041 /* 3042 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3043 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3044 */ 3045 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3046 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3047 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3048 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3049 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3050 { 3051 PetscMPIInt size,flag; 3052 PetscInt i,ii,cismax,ispar; 3053 Mat *A,*B; 3054 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3055 3056 PetscFunctionBegin; 3057 if (!ismax) PetscFunctionReturn(0); 3058 3059 for (i = 0, cismax = 0; i < ismax; ++i) { 3060 PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag)); 3061 PetscCheckFalse(flag != MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3062 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size)); 3063 if (size > 1) ++cismax; 3064 } 3065 3066 /* 3067 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3068 ispar counts the number of parallel ISs across C's comm. 3069 */ 3070 PetscCall(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C))); 3071 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3072 PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat)); 3073 PetscFunctionReturn(0); 3074 } 3075 3076 /* if (ispar) */ 3077 /* 3078 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3079 These are used to extract the off-diag portion of the resulting parallel matrix. 3080 The row IS for the off-diag portion is the same as for the diag portion, 3081 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3082 */ 3083 PetscCall(PetscMalloc2(cismax,&cisrow,cismax,&ciscol)); 3084 PetscCall(PetscMalloc1(cismax,&ciscol_p)); 3085 for (i = 0, ii = 0; i < ismax; ++i) { 3086 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size)); 3087 if (size > 1) { 3088 /* 3089 TODO: This is the part that's ***NOT SCALABLE***. 3090 To fix this we need to extract just the indices of C's nonzero columns 3091 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3092 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3093 to be done without serializing on the IS list, so, most likely, it is best 3094 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3095 */ 3096 PetscCall(ISGetNonlocalIS(iscol[i],&(ciscol[ii]))); 3097 /* Now we have to 3098 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3099 were sorted on each rank, concatenated they might no longer be sorted; 3100 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3101 indices in the nondecreasing order to the original index positions. 3102 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3103 */ 3104 PetscCall(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii)); 3105 PetscCall(ISSort(ciscol[ii])); 3106 ++ii; 3107 } 3108 } 3109 PetscCall(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p)); 3110 for (i = 0, ii = 0; i < ismax; ++i) { 3111 PetscInt j,issize; 3112 const PetscInt *indices; 3113 3114 /* 3115 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3116 */ 3117 PetscCall(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i)); 3118 PetscCall(ISSort(isrow[i])); 3119 PetscCall(ISGetLocalSize(isrow[i],&issize)); 3120 PetscCall(ISGetIndices(isrow[i],&indices)); 3121 for (j = 1; j < issize; ++j) { 3122 PetscCheckFalse(indices[j] == indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]); 3123 } 3124 PetscCall(ISRestoreIndices(isrow[i],&indices)); 3125 PetscCall(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i)); 3126 PetscCall(ISSort(iscol[i])); 3127 PetscCall(ISGetLocalSize(iscol[i],&issize)); 3128 PetscCall(ISGetIndices(iscol[i],&indices)); 3129 for (j = 1; j < issize; ++j) { 3130 PetscCheckFalse(indices[j-1] == indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]); 3131 } 3132 PetscCall(ISRestoreIndices(iscol[i],&indices)); 3133 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size)); 3134 if (size > 1) { 3135 cisrow[ii] = isrow[i]; 3136 ++ii; 3137 } 3138 } 3139 /* 3140 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3141 array of sequential matrices underlying the resulting parallel matrices. 3142 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3143 contain duplicates. 3144 3145 There are as many diag matrices as there are original index sets. There are only as many parallel 3146 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3147 3148 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3149 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3150 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3151 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3152 with A[i] and B[ii] extracted from the corresponding MPI submat. 3153 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3154 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3155 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3156 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3157 values into A[i] and B[ii] sitting inside the corresponding submat. 3158 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3159 A[i], B[ii], AA[i] or BB[ii] matrices. 3160 */ 3161 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3162 if (scall == MAT_INITIAL_MATRIX) { 3163 PetscCall(PetscMalloc1(ismax,submat)); 3164 } 3165 3166 /* Now obtain the sequential A and B submatrices separately. */ 3167 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3168 PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A)); 3169 PetscCall((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B)); 3170 3171 /* 3172 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3173 matrices A & B have been extracted directly into the parallel matrices containing them, or 3174 simply into the sequential matrix identical with the corresponding A (if size == 1). 3175 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3176 to have the same sparsity pattern. 3177 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3178 must be constructed for C. This is done by setseqmat(s). 3179 */ 3180 for (i = 0, ii = 0; i < ismax; ++i) { 3181 /* 3182 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3183 That way we can avoid sorting and computing permutations when reusing. 3184 To this end: 3185 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3186 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3187 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3188 */ 3189 MatStructure pattern = DIFFERENT_NONZERO_PATTERN; 3190 3191 PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size)); 3192 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3193 if (size > 1) { 3194 if (scall == MAT_INITIAL_MATRIX) { 3195 PetscCall(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i)); 3196 PetscCall(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE)); 3197 PetscCall(MatSetType((*submat)[i],MATMPIAIJ)); 3198 PetscCall(PetscLayoutSetUp((*submat)[i]->rmap)); 3199 PetscCall(PetscLayoutSetUp((*submat)[i]->cmap)); 3200 } 3201 /* 3202 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3203 */ 3204 { 3205 Mat AA = A[i],BB = B[ii]; 3206 3207 if (AA || BB) { 3208 PetscCall(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB)); 3209 PetscCall(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY)); 3210 PetscCall(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY)); 3211 } 3212 PetscCall(MatDestroy(&AA)); 3213 } 3214 PetscCall(ISDestroy(ciscol+ii)); 3215 PetscCall(ISDestroy(ciscol_p+ii)); 3216 ++ii; 3217 } else { /* if (size == 1) */ 3218 if (scall == MAT_REUSE_MATRIX) { 3219 PetscCall(MatDestroy(&(*submat)[i])); 3220 } 3221 if (isrow_p[i] || iscol_p[i]) { 3222 PetscCall(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i)); 3223 PetscCall(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i])); 3224 /* Otherwise A is extracted straight into (*submats)[i]. */ 3225 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3226 PetscCall(MatDestroy(A+i)); 3227 } else (*submat)[i] = A[i]; 3228 } 3229 PetscCall(ISDestroy(&isrow_p[i])); 3230 PetscCall(ISDestroy(&iscol_p[i])); 3231 } 3232 PetscCall(PetscFree2(cisrow,ciscol)); 3233 PetscCall(PetscFree2(isrow_p,iscol_p)); 3234 PetscCall(PetscFree(ciscol_p)); 3235 PetscCall(PetscFree(A)); 3236 PetscCall(MatDestroySubMatrices(cismax,&B)); 3237 PetscFunctionReturn(0); 3238 } 3239 3240 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3241 { 3242 PetscFunctionBegin; 3243 PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ)); 3244 PetscFunctionReturn(0); 3245 } 3246