/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Grow each of the imax index sets is[] by ov
   levels of overlap, by applying the one-level routine ov times.
   The is[] array is modified in place (each IS is destroyed and recreated).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is));
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but uses the memory-scalable (PetscSF-based) one-level routine.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is));
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One level of overlap, memory-scalable version.

   Rows in is[] owned by other ranks are shipped to their owners
   (PetscCommBuildTwoSided + PetscSF broadcast); the owners compute the overlap
   for those rows (Send_Scalable) and the results are shipped back and merged
   (Receive_Scalable). Locally-owned rows are handled by Local_Scalable, overlapped
   with the second communication round. Each is[i] is destroyed and recreated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat,&rmap,NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length));
  for (i=0,tlength=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i],&indices[i]));
  }
  /* find these rows on remote processors */
  PetscCall(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids));
  PetscCall(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp));
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap,indices_i[j],&owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i],&indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt***)&indices,length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
    PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size (pairs: <is id, row>) */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto+1,&toffsets));
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  /* remote is handed to the SF (PETSC_OWN_POINTER); do not free it here */
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata));
  for (i=0; i<nrrows; i++) {
    owner                 = rrow_ranks[i];       /* processor */
    j                     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
  PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata));
  PetscCall(PetscFree2(todata,fromdata));
  PetscCall(PetscFree(fromsizes));
  /* second rendezvous: exchange the sizes of the computed overlap data */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  PetscCall(PetscCalloc1(nrecvrows,&todata));
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: do the purely local overlap while
     the remote results are in flight */
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata,sbsizes));
  /* merge the remote overlap results into is[] */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata));
  PetscCall(PetscFree(toranks));
  PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge received overlap data into is[].

   recvdata is a stream of triples <is id, data_size, data_size column indices>
   of total length nrecvs. For each IS, its existing indices are copied into a
   scratch slab of width (max_lsize+nrecvs), the received columns are appended,
   duplicates are removed by sort, and the IS is recreated on its original
   communicator. Each is[i] is destroyed and recreated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx,&isz));
  for (i=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&lsize));
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of width (max_lsize+nrecvs) per IS; nrecvs bounds the number of
     received entries that could be appended to any single IS */
  PetscCall(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms));
  for (i=0; i<nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
    PetscCall(ISGetIndices(is[i],&indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i],&indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i,indices_i));
    PetscCall(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp,iscomms));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - Compute the overlap for rows that
   other ranks asked this rank about.

   fromrows holds pairs <is id, row> grouped per sending rank (sizes in
   fromsizes[2*i]). For each rank and IS, the column indices of the requested
   rows (from both the diagonal A block and off-diagonal B block, mapped to
   global numbering) are gathered, sorted and deduplicated. The reply buffers
   are returned in *sbrows (triples <is id, count, columns...>) and *sbrowsizes
   (per-rank <size, offset> pairs); the CALLER frees both with PetscFree2(sbdata,sbsizes).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  PetscCall(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp));
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count per-(rank,IS) result sizes so the reply buffer can be
     allocated exactly once */
  for (i=0; i<nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: regroup and fill the reply buffer (use this code again) */
  for (i=0;i<nfrom;i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]        += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      sbsizes[2*i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij));
      totalrows += indvc_ij;
    }
  }
  PetscCall(PetscMalloc1(nfrom+1,&offsets));
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - One level of overlap for the rows
   of is[] owned by this rank.

   For every locally owned row in is[i], appends the global column indices of
   that row (diagonal A block shifted by cstart, off-diagonal B block mapped
   through gcols), sorts, removes duplicates, and recreates is[i] on its
   original communicator. Rows owned by other ranks are skipped here; they are
   handled by the Send/Receive_Scalable pair.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp;
  PetscMPIInt    rank,owner;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz,&indices_temp));
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i],&lsize));
    PetscCall(ISGetIndices(is[i],&indices));
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap,row,&owner));
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i],&indices));
    /* keep the IS communicator alive across the destroy/recreate */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp));
    PetscCall(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] =
sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
/*
   MatIncreaseOverlap_MPIAIJ_Once - One level of overlap, non-scalable version.

   Phase 1: every rank parses its index sets, keeps locally owned rows (marked
   in a bit table, plus either data[] or a PetscTable depending on
   PETSC_USE_CTABLE) and sends remotely owned rows to their owners. Phase 2:
   owners compute the overlap (MatIncreaseOverlap_MPIAIJ_Receive) and send the
   results back, which are merged into the tables. Finally each is[i] is
   destroyed and recreated from the accumulated indices.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt     *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt  **idx,*idx_i;
  PetscInt        *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable      *table_data,table_data_i;
  PetscInt        *tdata,tcount,tcount_max;
#else
  PetscInt        *data_i,*d_p;
#endif
  PetscMPIInt     size,rank,tag1,tag2,proc = 0;
  PetscInt        M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt        *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT         *table;
  MPI_Comm        comm;
  MPI_Request     *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status      *recv_status;
  MPI_Comm        *iscomms;
  char            *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));

  PetscCall(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n));

  for (i=0; i<imax; i++) {
    PetscCall(ISGetIndices(is[i],&idx[i]));
    PetscCall(ISGetLocalSize(is[i],&n[i]));
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  /* w1: total rows to each proc; w2: 1 iff a message goes to proc; w3: number
     of index sets contributing to each proc; w4: per-IS scratch counts */
  PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));
  for (i=0; i<imax; i++) {
    PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs,&pa));
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
  PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr));
  PetscCall(PetscArrayzero(outdat,size));
  PetscCall(PetscArrayzero(ptr,size));

  /* carve the single tmp buffer into one outgoing message per target proc */
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j]));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax,&table_data));
    for (i=0; i<imax; i++) {
      PetscCall(PetscTableCreate(n[i],M,&table_data[i]));
    }
    PetscCall(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M,imax, &Mimax));
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      PetscCall(PetscArrayzero(ctr,size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          /* table stores 1-based key/value, hence the +1 */
          PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs,&s_waits1));
  for (i=0; i<nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i));
  }

  /* No longer need the original indices */
  PetscCall(PetscMalloc1(imax,&iscomms));
  for (i=0; i<imax; ++i) {
    PetscCall(ISRestoreIndices(is[i],idx+i));
    /* keep each IS communicator alive so the IS can be recreated on it below */
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
  }
  PetscCall(PetscFree2(*(PetscInt***)&idx,n));

  for (i=0; i<imax; ++i) {
    PetscCall(ISDestroy(&is[i]));
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr,&recv_status));
  PetscCallMPI(MPI_Waitall(nrqr,r_waits1,recv_status));
  PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat,ptr,tmp,ctr));
  PetscCall(PetscFree4(w1,w2,w3,w4));

  PetscCall(PetscMalloc1(nrqr,&xdata));
  PetscCall(PetscMalloc1(nrqr,&isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size,&rw1));

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc == onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr,&s_waits2));
  for (i=0; i<nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i));
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data starts after the <is id, count> header pairs */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
  PetscCall(PetscMalloc1(tcount_max+1,&tdata));
#endif

  /* recreate each IS from the accumulated (table or data[]) indices */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&k,&j));
      tdata[--j] = --k; /* convert 1-based table entries back to 0-based indices */
    }
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i));
#else
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0]));
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    PetscCall(PetscTableDestroy((PetscTable*)&table_data[i]));
  }
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table,data,isz,t_p));
#else
  PetscCall(PetscFree5(table,data,isz,d_p,t_p));
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    PetscCheck(tcount == isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    PetscCall(PetscMalloc1(tcount,&tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&row,&j));
      tdata[--j] = --row; /* table entries are stored 1-based */
      PetscCheck(j <= tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for every row already in the set, mark all of its columns; new columns
       (not yet in the bit table) are appended to the solution */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather then all previous rows as it is now where a single large chunk of
   memory is used.
912 913 */ 914 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 915 { 916 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 917 Mat A = c->A,B = c->B; 918 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 919 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 920 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 921 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 922 PetscInt *rbuf_i,kmax,rbuf_0; 923 PetscBT xtable; 924 925 PetscFunctionBegin; 926 m = C->rmap->N; 927 rstart = C->rmap->rstart; 928 cstart = C->cmap->rstart; 929 ai = a->i; 930 aj = a->j; 931 bi = b->i; 932 bj = b->j; 933 garray = c->garray; 934 935 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 936 rbuf_i = rbuf[i]; 937 rbuf_0 = rbuf_i[0]; 938 ct += rbuf_0; 939 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 940 } 941 942 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 943 else max1 = 1; 944 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 945 if (nrqr) { 946 PetscCall(PetscMalloc1(mem_estimate,&xdata[0])); 947 ++no_malloc; 948 } 949 PetscCall(PetscBTCreate(m,&xtable)); 950 PetscCall(PetscArrayzero(isz1,nrqr)); 951 952 ct3 = 0; 953 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 954 rbuf_i = rbuf[i]; 955 rbuf_0 = rbuf_i[0]; 956 ct1 = 2*rbuf_0+1; 957 ct2 = ct1; 958 ct3 += ct1; 959 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 960 PetscCall(PetscBTMemzero(m,xtable)); 961 oct2 = ct2; 962 kmax = rbuf_i[2*j]; 963 for (k=0; k<kmax; k++,ct1++) { 964 row = rbuf_i[ct1]; 965 if (!PetscBTLookupSet(xtable,row)) { 966 if (!(ct3 < mem_estimate)) { 967 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 968 PetscCall(PetscMalloc1(new_estimate,&tmp)); 969 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 970 PetscCall(PetscFree(xdata[0])); 971 xdata[0] = tmp; 972 mem_estimate = new_estimate; ++no_malloc; 973 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 974 } 975 xdata[i][ct2++] = row; 976 ct3++; 977 } 978 } 979 for (k=oct2,max2=ct2; k<max2; k++) { 980 row = xdata[i][k] - rstart; 981 start = ai[row]; 982 end = ai[row+1]; 983 for (l=start; l<end; l++) { 984 val = aj[l] + cstart; 985 if (!PetscBTLookupSet(xtable,val)) { 986 if (!(ct3 < mem_estimate)) { 987 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 988 PetscCall(PetscMalloc1(new_estimate,&tmp)); 989 PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 990 PetscCall(PetscFree(xdata[0])); 991 xdata[0] = tmp; 992 mem_estimate = new_estimate; ++no_malloc; 993 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 994 } 995 xdata[i][ct2++] = val; 996 ct3++; 997 } 998 } 999 start = bi[row]; 1000 end = bi[row+1]; 1001 for (l=start; l<end; l++) { 1002 val = garray[bj[l]]; 1003 if (!PetscBTLookupSet(xtable,val)) { 1004 if (!(ct3 < mem_estimate)) { 1005 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1006 PetscCall(PetscMalloc1(new_estimate,&tmp)); 1007 
PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate)); 1008 PetscCall(PetscFree(xdata[0])); 1009 xdata[0] = tmp; 1010 mem_estimate = new_estimate; ++no_malloc; 1011 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1012 } 1013 xdata[i][ct2++] = val; 1014 ct3++; 1015 } 1016 } 1017 } 1018 /* Update the header*/ 1019 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1020 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1021 } 1022 xdata[i][0] = rbuf_0; 1023 if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2; 1024 isz1[i] = ct2; /* size of each message */ 1025 } 1026 PetscCall(PetscBTDestroy(&xtable)); 1027 PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc)); 1028 PetscFunctionReturn(0); 1029 } 1030 /* -------------------------------------------------------------------------*/ 1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1032 /* 1033 Every processor gets the entire matrix 1034 */ 1035 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1036 { 1037 Mat B; 1038 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1039 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1040 PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL; 1041 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1042 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1043 1044 PetscFunctionBegin; 1045 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1046 PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank)); 1047 if (scall == MAT_INITIAL_MATRIX) { 1048 /* ---------------------------------------------------------------- 1049 Tell every processor the number of nonzeros per row 1050 */ 1051 PetscCall(PetscMalloc1(A->rmap->N,&lens)); 1052 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 
1053 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1054 } 1055 PetscCall(PetscMalloc2(size,&recvcounts,size,&displs)); 1056 for (i=0; i<size; i++) { 1057 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1058 displs[i] = A->rmap->range[i]; 1059 } 1060 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A))); 1061 /* --------------------------------------------------------------- 1062 Create the sequential matrix of the same type as the local block diagonal 1063 */ 1064 PetscCall(MatCreate(PETSC_COMM_SELF,&B)); 1065 PetscCall(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE)); 1066 PetscCall(MatSetBlockSizesFromMats(B,A,A)); 1067 PetscCall(MatSetType(B,((PetscObject)a->A)->type_name)); 1068 PetscCall(MatSeqAIJSetPreallocation(B,0,lens)); 1069 PetscCall(PetscCalloc1(2,Bin)); 1070 **Bin = B; 1071 b = (Mat_SeqAIJ*)B->data; 1072 1073 /*-------------------------------------------------------------------- 1074 Copy my part of matrix column indices over 1075 */ 1076 sendcount = ad->nz + bd->nz; 1077 jsendbuf = b->j + b->i[rstarts[rank]]; 1078 a_jsendbuf = ad->j; 1079 b_jsendbuf = bd->j; 1080 n = A->rmap->rend - A->rmap->rstart; 1081 cnt = 0; 1082 for (i=0; i<n; i++) { 1083 /* put in lower diagonal portion */ 1084 m = bd->i[i+1] - bd->i[i]; 1085 while (m > 0) { 1086 /* is it above diagonal (in bd (compressed) numbering) */ 1087 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1088 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1089 m--; 1090 } 1091 1092 /* put in diagonal portion */ 1093 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1094 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1095 } 1096 1097 /* put in upper diagonal portion */ 1098 while (m-- > 0) { 1099 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1100 } 1101 } 1102 PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz 
given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt); 1103 1104 /*-------------------------------------------------------------------- 1105 Gather all column indices to all processors 1106 */ 1107 for (i=0; i<size; i++) { 1108 recvcounts[i] = 0; 1109 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1110 recvcounts[i] += lens[j]; 1111 } 1112 } 1113 displs[0] = 0; 1114 for (i=1; i<size; i++) { 1115 displs[i] = displs[i-1] + recvcounts[i-1]; 1116 } 1117 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A))); 1118 /*-------------------------------------------------------------------- 1119 Assemble the matrix into useable form (note numerical values not yet set) 1120 */ 1121 /* set the b->ilen (length of each row) values */ 1122 PetscCall(PetscArraycpy(b->ilen,lens,A->rmap->N)); 1123 /* set the b->i indices */ 1124 b->i[0] = 0; 1125 for (i=1; i<=A->rmap->N; i++) { 1126 b->i[i] = b->i[i-1] + lens[i-1]; 1127 } 1128 PetscCall(PetscFree(lens)); 1129 PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 1130 PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 1131 1132 } else { 1133 B = **Bin; 1134 b = (Mat_SeqAIJ*)B->data; 1135 } 1136 1137 /*-------------------------------------------------------------------- 1138 Copy my part of matrix numerical values into the values location 1139 */ 1140 if (flag == MAT_GET_VALUES) { 1141 const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf; 1142 MatScalar *sendbuf,*recvbuf; 1143 1144 PetscCall(MatSeqAIJGetArrayRead(a->A,&ada)); 1145 PetscCall(MatSeqAIJGetArrayRead(a->B,&bda)); 1146 sendcount = ad->nz + bd->nz; 1147 sendbuf = b->a + b->i[rstarts[rank]]; 1148 a_sendbuf = ada; 1149 b_sendbuf = bda; 1150 b_sendj = bd->j; 1151 n = A->rmap->rend - A->rmap->rstart; 1152 cnt = 0; 1153 for (i=0; i<n; i++) { 1154 /* put in lower diagonal portion */ 1155 m = bd->i[i+1] - bd->i[i]; 1156 while (m > 0) { 1157 /* is it above diagonal (in bd (compressed) numbering) */ 
1158 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1159 sendbuf[cnt++] = *b_sendbuf++; 1160 m--; 1161 b_sendj++; 1162 } 1163 1164 /* put in diagonal portion */ 1165 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1166 sendbuf[cnt++] = *a_sendbuf++; 1167 } 1168 1169 /* put in upper diagonal portion */ 1170 while (m-- > 0) { 1171 sendbuf[cnt++] = *b_sendbuf++; 1172 b_sendj++; 1173 } 1174 } 1175 PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt); 1176 1177 /* ----------------------------------------------------------------- 1178 Gather all numerical values to all processors 1179 */ 1180 if (!recvcounts) { 1181 PetscCall(PetscMalloc2(size,&recvcounts,size,&displs)); 1182 } 1183 for (i=0; i<size; i++) { 1184 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1185 } 1186 displs[0] = 0; 1187 for (i=1; i<size; i++) { 1188 displs[i] = displs[i-1] + recvcounts[i-1]; 1189 } 1190 recvbuf = b->a; 1191 PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A))); 1192 PetscCall(MatSeqAIJRestoreArrayRead(a->A,&ada)); 1193 PetscCall(MatSeqAIJRestoreArrayRead(a->B,&bda)); 1194 } /* endof (flag == MAT_GET_VALUES) */ 1195 PetscCall(PetscFree2(recvcounts,displs)); 1196 1197 PetscCall(MatPropagateSymmetryOptions(A,B)); 1198 PetscFunctionReturn(0); 1199 } 1200 1201 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1202 { 1203 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1204 Mat submat,A = c->A,B = c->B; 1205 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1206 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1207 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1208 const PetscInt *icol,*irow; 1209 PetscInt nrow,ncol,start; 1210 
PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1211 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1212 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1213 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1214 PetscInt *lens,rmax,ncols,*cols,Crow; 1215 #if defined(PETSC_USE_CTABLE) 1216 PetscTable cmap,rmap; 1217 PetscInt *cmap_loc,*rmap_loc; 1218 #else 1219 PetscInt *cmap,*rmap; 1220 #endif 1221 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1222 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1223 PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL; 1224 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1225 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1226 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1227 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1228 MPI_Comm comm; 1229 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1230 PetscMPIInt *onodes1,*olengths1,idex,end; 1231 Mat_SubSppt *smatis1; 1232 PetscBool isrowsorted,iscolsorted; 1233 1234 PetscFunctionBegin; 1235 PetscValidLogicalCollectiveInt(C,ismax,2); 1236 PetscValidLogicalCollectiveEnum(C,scall,5); 1237 PetscCheck(ismax == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1238 PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a)); 1239 PetscCall(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a)); 1240 PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 1241 size = c->size; 1242 rank = c->rank; 1243 1244 PetscCall(ISSorted(iscol[0],&iscolsorted)); 1245 PetscCall(ISSorted(isrow[0],&isrowsorted)); 1246 PetscCall(ISGetIndices(isrow[0],&irow)); 1247 PetscCall(ISGetLocalSize(isrow[0],&nrow)); 1248 if (allcolumns) { 1249 icol = NULL; 1250 ncol = C->cmap->N; 1251 } else { 1252 PetscCall(ISGetIndices(iscol[0],&icol)); 1253 PetscCall(ISGetLocalSize(iscol[0],&ncol)); 1254 } 1255 1256 if (scall == 
MAT_INITIAL_MATRIX) { 1257 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1258 1259 /* Get some new tags to keep the communication clean */ 1260 tag1 = ((PetscObject)C)->tag; 1261 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2)); 1262 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3)); 1263 1264 /* evaluate communication - mesg to who, length of mesg, and buffer space 1265 required. Based on this, buffers are allocated, and data copied into them */ 1266 PetscCall(PetscCalloc2(size,&w1,size,&w2)); 1267 PetscCall(PetscMalloc1(nrow,&row2proc)); 1268 1269 /* w1[proc] = num of rows owned by proc -- to be requested */ 1270 proc = 0; 1271 nrqs = 0; /* num of outgoing messages */ 1272 for (j=0; j<nrow; j++) { 1273 row = irow[j]; 1274 if (!isrowsorted) proc = 0; 1275 while (row >= C->rmap->range[proc+1]) proc++; 1276 w1[proc]++; 1277 row2proc[j] = proc; /* map row index to proc */ 1278 1279 if (proc != rank && !w2[proc]) { 1280 w2[proc] = 1; nrqs++; 1281 } 1282 } 1283 w1[rank] = 0; /* rows owned by self will not be requested */ 1284 1285 PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/ 1286 for (proc=0,j=0; proc<size; proc++) { 1287 if (w1[proc]) { pa[j++] = proc;} 1288 } 1289 1290 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1291 msz = 0; /* total mesg length (for all procs) */ 1292 for (i=0; i<nrqs; i++) { 1293 proc = pa[i]; 1294 w1[proc] += 3; 1295 msz += w1[proc]; 1296 } 1297 PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz)); 1298 1299 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1300 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1301 PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 1302 1303 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1304 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1305 
PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 1306 1307 /* Now post the Irecvs corresponding to these messages */ 1308 PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1)); 1309 1310 PetscCall(PetscFree(onodes1)); 1311 PetscCall(PetscFree(olengths1)); 1312 1313 /* Allocate Memory for outgoing messages */ 1314 PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr)); 1315 PetscCall(PetscArrayzero(sbuf1,size)); 1316 PetscCall(PetscArrayzero(ptr,size)); 1317 1318 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1319 iptr = tmp; 1320 for (i=0; i<nrqs; i++) { 1321 proc = pa[i]; 1322 sbuf1[proc] = iptr; 1323 iptr += w1[proc]; 1324 } 1325 1326 /* Form the outgoing messages */ 1327 /* Initialize the header space */ 1328 for (i=0; i<nrqs; i++) { 1329 proc = pa[i]; 1330 PetscCall(PetscArrayzero(sbuf1[proc],3)); 1331 ptr[proc] = sbuf1[proc] + 3; 1332 } 1333 1334 /* Parse the isrow and copy data into outbuf */ 1335 PetscCall(PetscArrayzero(ctr,size)); 1336 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1337 proc = row2proc[j]; 1338 if (proc != rank) { /* copy to the outgoing buf*/ 1339 *ptr[proc] = irow[j]; 1340 ctr[proc]++; ptr[proc]++; 1341 } 1342 } 1343 1344 /* Update the headers for the current IS */ 1345 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1346 if ((ctr_j = ctr[j])) { 1347 sbuf1_j = sbuf1[j]; 1348 k = ++sbuf1_j[0]; 1349 sbuf1_j[2*k] = ctr_j; 1350 sbuf1_j[2*k-1] = 0; 1351 } 1352 } 1353 1354 /* Now post the sends */ 1355 PetscCall(PetscMalloc1(nrqs,&s_waits1)); 1356 for (i=0; i<nrqs; ++i) { 1357 proc = pa[i]; 1358 PetscCallMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i)); 1359 } 1360 1361 /* Post Receives to capture the buffer size */ 1362 PetscCall(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2)); 1363 PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3)); 1364 1365 if (nrqs) rbuf2[0] = 
tmp + msz; 1366 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1367 1368 for (i=0; i<nrqs; ++i) { 1369 proc = pa[i]; 1370 PetscCallMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i)); 1371 } 1372 1373 PetscCall(PetscFree2(w1,w2)); 1374 1375 /* Send to other procs the buf size they should allocate */ 1376 /* Receive messages*/ 1377 PetscCall(PetscMalloc1(nrqr,&r_status1)); 1378 PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1)); 1379 1380 PetscCallMPI(MPI_Waitall(nrqr,r_waits1,r_status1)); 1381 for (i=0; i<nrqr; ++i) { 1382 req_size[i] = 0; 1383 rbuf1_i = rbuf1[i]; 1384 start = 2*rbuf1_i[0] + 1; 1385 PetscCallMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end)); 1386 PetscCall(PetscMalloc1(end,&sbuf2[i])); 1387 sbuf2_i = sbuf2[i]; 1388 for (j=start; j<end; j++) { 1389 k = rbuf1_i[j] - rstart; 1390 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1391 sbuf2_i[j] = ncols; 1392 req_size[i] += ncols; 1393 } 1394 req_source1[i] = r_status1[i].MPI_SOURCE; 1395 1396 /* form the header */ 1397 sbuf2_i[0] = req_size[i]; 1398 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1399 1400 PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i)); 1401 } 1402 1403 PetscCall(PetscFree(r_status1)); 1404 PetscCall(PetscFree(r_waits1)); 1405 1406 /* rbuf2 is received, Post recv column indices a->j */ 1407 PetscCallMPI(MPI_Waitall(nrqs,r_waits2,r_status2)); 1408 1409 PetscCall(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3)); 1410 for (i=0; i<nrqs; ++i) { 1411 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); 1412 req_source2[i] = r_status2[i].MPI_SOURCE; 1413 PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i)); 1414 } 1415 1416 /* Wait on sends1 and sends2 */ 1417 PetscCall(PetscMalloc1(nrqs,&s_status1)); 1418 PetscCallMPI(MPI_Waitall(nrqs,s_waits1,s_status1)); 1419 PetscCall(PetscFree(s_waits1)); 1420 PetscCall(PetscFree(s_status1)); 1421 1422 
PetscCallMPI(MPI_Waitall(nrqr,s_waits2,s_status2)); 1423 PetscCall(PetscFree4(r_status2,s_waits2,r_waits2,s_status2)); 1424 1425 /* Now allocate sending buffers for a->j, and send them off */ 1426 PetscCall(PetscMalloc1(nrqr,&sbuf_aj)); 1427 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1428 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0])); 1429 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1430 1431 for (i=0; i<nrqr; i++) { /* for each requested message */ 1432 rbuf1_i = rbuf1[i]; 1433 sbuf_aj_i = sbuf_aj[i]; 1434 ct1 = 2*rbuf1_i[0] + 1; 1435 ct2 = 0; 1436 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1437 1438 kmax = rbuf1[i][2]; 1439 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1440 row = rbuf1_i[ct1] - rstart; 1441 nzA = ai[row+1] - ai[row]; 1442 nzB = bi[row+1] - bi[row]; 1443 ncols = nzA + nzB; 1444 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1445 1446 /* load the column indices for this row into cols*/ 1447 cols = sbuf_aj_i + ct2; 1448 1449 lwrite = 0; 1450 for (l=0; l<nzB; l++) { 1451 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1452 } 1453 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1454 for (l=0; l<nzB; l++) { 1455 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1456 } 1457 1458 ct2 += ncols; 1459 } 1460 PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i)); 1461 } 1462 1463 /* create column map (cmap): global col of C -> local col of submat */ 1464 #if defined(PETSC_USE_CTABLE) 1465 if (!allcolumns) { 1466 PetscCall(PetscTableCreate(ncol,C->cmap->N,&cmap)); 1467 PetscCall(PetscCalloc1(C->cmap->n,&cmap_loc)); 1468 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1469 if (icol[j] >= cstart && icol[j] <cend) { 1470 cmap_loc[icol[j] - cstart] = j+1; 1471 } else { /* use PetscTable for non-local col indices */ 1472 PetscCall(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES)); 
1473 } 1474 } 1475 } else { 1476 cmap = NULL; 1477 cmap_loc = NULL; 1478 } 1479 PetscCall(PetscCalloc1(C->rmap->n,&rmap_loc)); 1480 #else 1481 if (!allcolumns) { 1482 PetscCall(PetscCalloc1(C->cmap->N,&cmap)); 1483 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1484 } else { 1485 cmap = NULL; 1486 } 1487 #endif 1488 1489 /* Create lens for MatSeqAIJSetPreallocation() */ 1490 PetscCall(PetscCalloc1(nrow,&lens)); 1491 1492 /* Compute lens from local part of C */ 1493 for (j=0; j<nrow; j++) { 1494 row = irow[j]; 1495 proc = row2proc[j]; 1496 if (proc == rank) { 1497 /* diagonal part A = c->A */ 1498 ncols = ai[row-rstart+1] - ai[row-rstart]; 1499 cols = aj + ai[row-rstart]; 1500 if (!allcolumns) { 1501 for (k=0; k<ncols; k++) { 1502 #if defined(PETSC_USE_CTABLE) 1503 tcol = cmap_loc[cols[k]]; 1504 #else 1505 tcol = cmap[cols[k]+cstart]; 1506 #endif 1507 if (tcol) lens[j]++; 1508 } 1509 } else { /* allcolumns */ 1510 lens[j] = ncols; 1511 } 1512 1513 /* off-diagonal part B = c->B */ 1514 ncols = bi[row-rstart+1] - bi[row-rstart]; 1515 cols = bj + bi[row-rstart]; 1516 if (!allcolumns) { 1517 for (k=0; k<ncols; k++) { 1518 #if defined(PETSC_USE_CTABLE) 1519 PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol)); 1520 #else 1521 tcol = cmap[bmap[cols[k]]]; 1522 #endif 1523 if (tcol) lens[j]++; 1524 } 1525 } else { /* allcolumns */ 1526 lens[j] += ncols; 1527 } 1528 } 1529 } 1530 1531 /* Create row map (rmap): global row of C -> local row of submat */ 1532 #if defined(PETSC_USE_CTABLE) 1533 PetscCall(PetscTableCreate(nrow,C->rmap->N,&rmap)); 1534 for (j=0; j<nrow; j++) { 1535 row = irow[j]; 1536 proc = row2proc[j]; 1537 if (proc == rank) { /* a local row */ 1538 rmap_loc[row - rstart] = j; 1539 } else { 1540 PetscCall(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES)); 1541 } 1542 } 1543 #else 1544 PetscCall(PetscCalloc1(C->rmap->N,&rmap)); 1545 for (j=0; j<nrow; j++) { 1546 rmap[irow[j]] = j; 1547 } 1548 #endif 1549 1550 /* Update lens from offproc data */ 1551 /* recv a->j is 
done */ 1552 PetscCallMPI(MPI_Waitall(nrqs,r_waits3,r_status3)); 1553 for (i=0; i<nrqs; i++) { 1554 proc = pa[i]; 1555 sbuf1_i = sbuf1[proc]; 1556 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1557 ct1 = 2 + 1; 1558 ct2 = 0; 1559 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1560 rbuf3_i = rbuf3[i]; /* received C->j */ 1561 1562 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1563 max1 = sbuf1_i[2]; 1564 for (k=0; k<max1; k++,ct1++) { 1565 #if defined(PETSC_USE_CTABLE) 1566 PetscCall(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row)); 1567 row--; 1568 PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1569 #else 1570 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1571 #endif 1572 /* Now, store row index of submat in sbuf1_i[ct1] */ 1573 sbuf1_i[ct1] = row; 1574 1575 nnz = rbuf2_i[ct1]; 1576 if (!allcolumns) { 1577 for (l=0; l<nnz; l++,ct2++) { 1578 #if defined(PETSC_USE_CTABLE) 1579 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1580 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1581 } else { 1582 PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol)); 1583 } 1584 #else 1585 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1586 #endif 1587 if (tcol) lens[row]++; 1588 } 1589 } else { /* allcolumns */ 1590 lens[row] += nnz; 1591 } 1592 } 1593 } 1594 PetscCallMPI(MPI_Waitall(nrqr,s_waits3,s_status3)); 1595 PetscCall(PetscFree4(r_waits3,s_waits3,r_status3,s_status3)); 1596 1597 /* Create the submatrices */ 1598 PetscCall(MatCreate(PETSC_COMM_SELF,&submat)); 1599 PetscCall(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE)); 1600 1601 PetscCall(ISGetBlockSize(isrow[0],&i)); 1602 PetscCall(ISGetBlockSize(iscol[0],&j)); 1603 PetscCall(MatSetBlockSizes(submat,i,j)); 1604 PetscCall(MatSetType(submat,((PetscObject)A)->type_name)); 1605 PetscCall(MatSeqAIJSetPreallocation(submat,0,lens)); 1606 1607 /* create struct Mat_SubSppt and attached 
it to submat */ 1608 PetscCall(PetscNew(&smatis1)); 1609 subc = (Mat_SeqAIJ*)submat->data; 1610 subc->submatis1 = smatis1; 1611 1612 smatis1->id = 0; 1613 smatis1->nrqs = nrqs; 1614 smatis1->nrqr = nrqr; 1615 smatis1->rbuf1 = rbuf1; 1616 smatis1->rbuf2 = rbuf2; 1617 smatis1->rbuf3 = rbuf3; 1618 smatis1->sbuf2 = sbuf2; 1619 smatis1->req_source2 = req_source2; 1620 1621 smatis1->sbuf1 = sbuf1; 1622 smatis1->ptr = ptr; 1623 smatis1->tmp = tmp; 1624 smatis1->ctr = ctr; 1625 1626 smatis1->pa = pa; 1627 smatis1->req_size = req_size; 1628 smatis1->req_source1 = req_source1; 1629 1630 smatis1->allcolumns = allcolumns; 1631 smatis1->singleis = PETSC_TRUE; 1632 smatis1->row2proc = row2proc; 1633 smatis1->rmap = rmap; 1634 smatis1->cmap = cmap; 1635 #if defined(PETSC_USE_CTABLE) 1636 smatis1->rmap_loc = rmap_loc; 1637 smatis1->cmap_loc = cmap_loc; 1638 #endif 1639 1640 smatis1->destroy = submat->ops->destroy; 1641 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1642 submat->factortype = C->factortype; 1643 1644 /* compute rmax */ 1645 rmax = 0; 1646 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1647 1648 } else { /* scall == MAT_REUSE_MATRIX */ 1649 submat = submats[0]; 1650 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1651 1652 subc = (Mat_SeqAIJ*)submat->data; 1653 rmax = subc->rmax; 1654 smatis1 = subc->submatis1; 1655 nrqs = smatis1->nrqs; 1656 nrqr = smatis1->nrqr; 1657 rbuf1 = smatis1->rbuf1; 1658 rbuf2 = smatis1->rbuf2; 1659 rbuf3 = smatis1->rbuf3; 1660 req_source2 = smatis1->req_source2; 1661 1662 sbuf1 = smatis1->sbuf1; 1663 sbuf2 = smatis1->sbuf2; 1664 ptr = smatis1->ptr; 1665 tmp = smatis1->tmp; 1666 ctr = smatis1->ctr; 1667 1668 pa = smatis1->pa; 1669 req_size = smatis1->req_size; 1670 req_source1 = smatis1->req_source1; 1671 1672 allcolumns = smatis1->allcolumns; 1673 row2proc = smatis1->row2proc; 1674 rmap = smatis1->rmap; 1675 cmap = smatis1->cmap; 1676 #if defined(PETSC_USE_CTABLE) 1677 rmap_loc = smatis1->rmap_loc; 1678 cmap_loc = smatis1->cmap_loc; 1679 #endif 1680 } 1681 1682 /* Post recv matrix values */ 1683 PetscCall(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals)); 1684 PetscCall(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4)); 1685 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4)); 1686 for (i=0; i<nrqs; ++i) { 1687 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i])); 1688 PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i)); 1689 } 1690 1691 /* Allocate sending buffers for a->a, and send them off */ 1692 PetscCall(PetscMalloc1(nrqr,&sbuf_aa)); 1693 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1694 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0])); 1695 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1696 1697 for (i=0; i<nrqr; i++) { 1698 rbuf1_i = rbuf1[i]; 1699 sbuf_aa_i = sbuf_aa[i]; 1700 ct1 = 2*rbuf1_i[0]+1; 1701 ct2 = 0; 1702 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1703 1704 kmax = rbuf1_i[2]; 1705 for (k=0; k<kmax; k++,ct1++) { 1706 row = rbuf1_i[ct1] - rstart; 1707 nzA = ai[row+1] - ai[row]; 1708 nzB = bi[row+1] - bi[row]; 1709 ncols = nzA + nzB; 1710 cworkB = bj + bi[row]; 1711 vworkA = a_a + 
ai[row]; 1712 vworkB = b_a + bi[row]; 1713 1714 /* load the column values for this row into vals*/ 1715 vals = sbuf_aa_i + ct2; 1716 1717 lwrite = 0; 1718 for (l=0; l<nzB; l++) { 1719 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1720 } 1721 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1722 for (l=0; l<nzB; l++) { 1723 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1724 } 1725 1726 ct2 += ncols; 1727 } 1728 PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i)); 1729 } 1730 1731 /* Assemble submat */ 1732 /* First assemble the local rows */ 1733 for (j=0; j<nrow; j++) { 1734 row = irow[j]; 1735 proc = row2proc[j]; 1736 if (proc == rank) { 1737 Crow = row - rstart; /* local row index of C */ 1738 #if defined(PETSC_USE_CTABLE) 1739 row = rmap_loc[Crow]; /* row index of submat */ 1740 #else 1741 row = rmap[row]; 1742 #endif 1743 1744 if (allcolumns) { 1745 /* diagonal part A = c->A */ 1746 ncols = ai[Crow+1] - ai[Crow]; 1747 cols = aj + ai[Crow]; 1748 vals = a_a + ai[Crow]; 1749 i = 0; 1750 for (k=0; k<ncols; k++) { 1751 subcols[i] = cols[k] + cstart; 1752 subvals[i++] = vals[k]; 1753 } 1754 1755 /* off-diagonal part B = c->B */ 1756 ncols = bi[Crow+1] - bi[Crow]; 1757 cols = bj + bi[Crow]; 1758 vals = b_a + bi[Crow]; 1759 for (k=0; k<ncols; k++) { 1760 subcols[i] = bmap[cols[k]]; 1761 subvals[i++] = vals[k]; 1762 } 1763 1764 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1765 1766 } else { /* !allcolumns */ 1767 #if defined(PETSC_USE_CTABLE) 1768 /* diagonal part A = c->A */ 1769 ncols = ai[Crow+1] - ai[Crow]; 1770 cols = aj + ai[Crow]; 1771 vals = a_a + ai[Crow]; 1772 i = 0; 1773 for (k=0; k<ncols; k++) { 1774 tcol = cmap_loc[cols[k]]; 1775 if (tcol) { 1776 subcols[i] = --tcol; 1777 subvals[i++] = vals[k]; 1778 } 1779 } 1780 1781 /* off-diagonal part B = c->B */ 1782 ncols = bi[Crow+1] - bi[Crow]; 1783 cols = bj + bi[Crow]; 1784 vals = b_a + bi[Crow]; 1785 for 
(k=0; k<ncols; k++) { 1786 PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol)); 1787 if (tcol) { 1788 subcols[i] = --tcol; 1789 subvals[i++] = vals[k]; 1790 } 1791 } 1792 #else 1793 /* diagonal part A = c->A */ 1794 ncols = ai[Crow+1] - ai[Crow]; 1795 cols = aj + ai[Crow]; 1796 vals = a_a + ai[Crow]; 1797 i = 0; 1798 for (k=0; k<ncols; k++) { 1799 tcol = cmap[cols[k]+cstart]; 1800 if (tcol) { 1801 subcols[i] = --tcol; 1802 subvals[i++] = vals[k]; 1803 } 1804 } 1805 1806 /* off-diagonal part B = c->B */ 1807 ncols = bi[Crow+1] - bi[Crow]; 1808 cols = bj + bi[Crow]; 1809 vals = b_a + bi[Crow]; 1810 for (k=0; k<ncols; k++) { 1811 tcol = cmap[bmap[cols[k]]]; 1812 if (tcol) { 1813 subcols[i] = --tcol; 1814 subvals[i++] = vals[k]; 1815 } 1816 } 1817 #endif 1818 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1819 } 1820 } 1821 } 1822 1823 /* Now assemble the off-proc rows */ 1824 for (i=0; i<nrqs; i++) { /* for each requested message */ 1825 /* recv values from other processes */ 1826 PetscCallMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i)); 1827 proc = pa[idex]; 1828 sbuf1_i = sbuf1[proc]; 1829 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1830 ct1 = 2 + 1; 1831 ct2 = 0; /* count of received C->j */ 1832 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1833 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1834 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1835 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1836 1837 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1838 max1 = sbuf1_i[2]; /* num of rows */ 1839 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1840 row = sbuf1_i[ct1]; /* row index of submat */ 1841 if (!allcolumns) { 1842 idex = 0; 1843 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1844 nnz = rbuf2_i[ct1]; 
/* num of C entries in this row */ 1845 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1846 #if defined(PETSC_USE_CTABLE) 1847 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1848 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1849 } else { 1850 PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol)); 1851 } 1852 #else 1853 tcol = cmap[rbuf3_i[ct2]]; 1854 #endif 1855 if (tcol) { 1856 subcols[idex] = --tcol; /* may not be sorted */ 1857 subvals[idex++] = rbuf4_i[ct2]; 1858 1859 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1860 For reuse, we replace received C->j with index that should be inserted to submat */ 1861 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1862 } 1863 } 1864 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES)); 1865 } else { /* scall == MAT_REUSE_MATRIX */ 1866 submat = submats[0]; 1867 subc = (Mat_SeqAIJ*)submat->data; 1868 1869 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1870 for (l=0; l<nnz; l++) { 1871 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1872 subvals[idex++] = rbuf4_i[ct2]; 1873 } 1874 1875 bj = subc->j + subc->i[row]; /* sorted column indices */ 1876 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES)); 1877 } 1878 } else { /* allcolumns */ 1879 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1880 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES)); 1881 ct2 += nnz; 1882 } 1883 } 1884 } 1885 1886 /* sending a->a are done */ 1887 PetscCallMPI(MPI_Waitall(nrqr,s_waits4,s_status4)); 1888 PetscCall(PetscFree4(r_waits4,s_waits4,r_status4,s_status4)); 1889 1890 PetscCall(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY)); 1891 PetscCall(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY)); 1892 submats[0] = submat; 1893 1894 /* Restore the indices */ 1895 PetscCall(ISRestoreIndices(isrow[0],&irow)); 1896 if (!allcolumns) { 1897 
    PetscCall(ISRestoreIndices(iscol[0],&icol));
  }

  /* Destroy allocated memory */
  for (i=0; i<nrqs; ++i) {
    PetscCall(PetscFree(rbuf4[i]));
  }
  PetscCall(PetscFree3(rbuf4,subcols,subvals));
  if (sbuf_aa) {
    PetscCall(PetscFree(sbuf_aa[0]));  /* sbuf_aa[0] holds one contiguous allocation shared by all sbuf_aa[i] */
    PetscCall(PetscFree(sbuf_aa));
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscFree(lens));
    if (sbuf_aj) {
      PetscCall(PetscFree(sbuf_aj[0]));
      PetscCall(PetscFree(sbuf_aj));
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS - extract one parallel submatrix when every
  process supplies exactly a single index set (fast path used by PCSetUp_ASM()).

  Detects the "all columns" special case (iscol[0] is the identity covering all
  global columns) and forwards to the _Local worker with that flag.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscInt  ncol;
  PetscBool colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices; 2 slots: submat[0] plus one dummy slot */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscCalloc1(2,submat));
  }

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol[0],&colflag));
  PetscCall(ISGetLocalSize(iscol[0],&ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ - extract ismax sequential submatrices of the parallel
  matrix C, one per (isrow[i], iscol[i]) pair.

  The work is split into stages of at most nmax submatrices each so that the
  per-stage scratch memory (proportional to C->cmap->N per submatrix) stays
  bounded; nstages is reduced globally so every rank executes the same number
  of (collective) stages.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscInt    nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool   rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ  *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* budget: ~20 MB of PetscInt per stage; nmax = how many N-column maps fit in it */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow,&rowflag));
      PetscCall(ISIdentity(*iscol,&colflag));
      PetscCall(ISGetLocalSize(*isrow,&nrow));
      PetscCall(ISGetLocalSize(*iscol,&ncol));
      /* the single IS requests the entire (square) matrix: use the fast all-matrix path */
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* one reduction carries both flags: MPI_MAX on -wantallmatrix gives logical AND across ranks,
       MPI_MAX on nstages gives the global maximum stage count */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* replay the stage count saved by the INITIAL call */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscCalloc1(ismax+nstages,submat));
  }

  for (i=0,pos=0; i<nstages; i++) {
    /* this rank may run out of local ISs before nstages global stages complete:
       it still participates with max_no = 0 (the call is collective) */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax-pos;

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat          = (Mat_SubSppt*)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc          = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat          = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
/*
  MatCreateSubMatrices_MPIAIJ_Local - worker that extracts up to ismax sequential
  submatrices from the MPIAIJ matrix C in one communication round.

  Protocol overview (all exchanges are nonblocking Isend/Irecv pairs):
    tag0: requested row indices (with a per-IS header) sent to owning ranks
    tag2: per-row lengths returned by the owners
    tag3: column indices of the requested rows
    tag4: numerical values of the requested rows
  For MAT_REUSE_MATRIX, all buffers/plans are recovered from the Mat_SubSppt
  struct attached to each submatrix by the MAT_INITIAL_MATRIX call.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;                 /* diagonal block of C on this rank */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;

  PetscCall(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns));
  PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted));

  for (i=0; i<ismax; i++) {
    PetscCall(ISSorted(iscol[i],&issorted[i]));
    if (!issorted[i])
      iscsorted = issorted[i]; /* any unsorted iscol forces a final sort pass over the assembled rows */

    PetscCall(ISSorted(isrow[i],&issorted[i]));

    PetscCall(ISGetIndices(isrow[i],&irow[i]));
    PetscCall(ISGetLocalSize(isrow[i],&nrow[i]));

    /* Check for special case: allcolumn */
    PetscCall(ISIdentity(iscol[i],&colflag));
    PetscCall(ISGetLocalSize(iscol[i],&ncol[i]));
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i]       = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      PetscCall(ISGetIndices(iscol[i],&icol[i]));
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      PetscCheck(submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      PetscCall(PetscArrayzero(subc->ilen,submats[i]->rmap->n));

      /* recover the communication plan saved by the MAT_INITIAL_MATRIX call */
      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
      PetscCheck(submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data; /* dummy matrix stores the plan directly in ->data */

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them*/
    PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));   /* mesg size, initialize work vectors */

    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      PetscCall(PetscMalloc1(jmax,&row2proc_i));
      row2proc[i] = row2proc_i;

      /* locate the owner of each requested row by scanning the ownership ranges;
         for a sorted isrow the scan position carries over between rows */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j      = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz   += w1[j];
    }
    PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
    PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

    /* Now post the Irecvs corresponding to these messages */
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0));
    PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));

    /* Allocate Memory for outgoing messages */
    /* NOTE: tmp is sized 2*msz because its second half is later reused as rbuf2 storage (see below) */
    PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
    PetscCall(PetscArrayzero(sbuf1,size));
    PetscCall(PetscArrayzero(ptr,size));

    {
      /* carve per-destination send buffers out of the single tmp allocation */
      PetscInt *iptr = tmp;
      k    = 0;
      for (i=0; i<nrqs; i++) {
        j        = pa[i];
        iptr    += k;
        sbuf1[j] = iptr;
        k        = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    /* header layout: sbuf1[j][0] = #ISs; then (is_id, count) pairs; row data follows at ptr[j] */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
      ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      PetscCall(PetscArrayzero(ctr,size));
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0];
          sbuf1_j[2*k]   = ctr_j;
          sbuf1_j[2*k-1] = i;
        }
      }
    }

    /* Now post the sends */
    PetscCall(PetscMalloc1(nrqs,&s_waits1));
    for (i=0; i<nrqs; ++i) {
      j = pa[i];
      PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i));
    }

    /* Post Receives to capture the buffer size */
    PetscCall(PetscMalloc1(nrqs,&r_waits2));
    PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
    if (nrqs) rbuf2[0] = tmp + msz; /* reuse the second half of tmp as the rbuf2 storage */
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j = pa[i];
      PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i));
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    PetscCall(PetscMalloc1(nrqr,&s_waits2));
    PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
      for (i=0; i<nrqr; ++i) {
        /* for each requested row report its total nonzero count (diag + offdiag blocks) */
        req_size[i] = 0;
        rbuf1_i     = rbuf1[i];
        start       = 2*rbuf1_i[0] + 1; /* skip the (is_id,count) header pairs */
        end         = olengths1[i];
        PetscCall(PetscMalloc1(end,&sbuf2[i]));
        sbuf2_i     = sbuf2[i];
        for (j=start; j<end; j++) {
          id           = rbuf1_i[j] - rstart;
          ncols        = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
          sbuf2_i[j]   = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = onodes1[i];
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
      }
    }

    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));
    PetscCall(PetscFree(r_waits1));
    PetscCall(PetscFree4(w1,w2,w3,w4));

    /* Receive messages*/
    PetscCall(PetscMalloc1(nrqs,&r_waits3));
    PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE));
    for (i=0; i<nrqs; ++i) {
      PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); /* rbuf2[i][0] = total nnz coming from pa[i] */
      req_source2[i] = pa[i];
      PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
    }
    PetscCall(PetscFree(r_waits2));

    /* Wait on sends1 and sends2 */
    PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
    PetscCall(PetscFree(s_waits1));
    PetscCall(PetscFree(s_waits2));

    /* Now allocate sending buffers for a->j, and send them off */
    PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0])); /* one contiguous allocation, sliced below */
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    PetscCall(PetscMalloc1(nrqr,&s_waits3));
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row];
            nzB    = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row];
            cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols */
            cols = sbuf_aj_i + ct2;

            /* emit global columns in ascending order: B-cols left of the diag block,
               then the diag block (A), then B-cols right of the diag block */
            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
      }
    }

    /* create col map: global col of C -> local col of submatrices */
    /* stored shifted by +1 so that 0 means "column not selected" */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          PetscCall(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]));

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES));
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          PetscCall(PetscCalloc1(C->cmap->N,&cmap[i]));
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            cmap_i[icol_i[j]] = j+1;
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    PetscCall(PetscMalloc1(ismax,&lens));

    if (ismax) {
      PetscCall(PetscCalloc1(j,&lens[0])); /* one zeroed allocation, sliced per IS below */
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax       = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          PetscCall(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL));
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          PetscCall(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL));
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      PetscCall(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]));
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES));
      }
    }
#else
    for (i=0; i<ismax; i++) {
      PetscCall(PetscCalloc1(C->rmap->N,&rmap[i]));
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE));
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row));
            row--;
            PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    PetscCall(PetscFree(r_waits3));
    PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE));
    PetscCall(PetscFree(s_waits3));

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      PetscCall(ISGetBlockSize(isrow[i],&rbs));
      PetscCall(ISGetBlockSize(iscol[i],&cbs));

      PetscCall(MatCreate(PETSC_COMM_SELF,submats+i));
      PetscCall(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE));

      PetscCall(MatSetBlockSizes(submats[i],rbs,cbs));
      PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name));
      PetscCall(MatSeqAIJSetPreallocation(submats[i],0,lens[i]));

      /* create struct Mat_SubSppt and attached it to submat */
      /* the plan is shared by all submatrices; its ownership is documented by the custom destroy */
      PetscCall(PetscNew(&smat_i));
      subc            = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = allcolumns[i];
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = row2proc[i];
      smat_i->rmap       = rmap[i];
      smat_i->cmap       = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0]));
      PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE));
      PetscCall(MatSetType(submats[0],MATDUMMY));

      /* create struct Mat_SubSppt and attached it to submat */
      PetscCall(PetscNewLog(submats[0],&smat_i));
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = NULL;
      smat_i->rmap       = NULL;
      smat_i->cmap       = NULL;
    }

    if (ismax) PetscCall(PetscFree(lens[0]));
    PetscCall(PetscFree(lens));
    if (sbuf_aj) {
      PetscCall(PetscFree(sbuf_aj[0]));
      PetscCall(PetscFree(sbuf_aj));
    }

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
  PetscCall(PetscMalloc1(nrqs,&rbuf4));
  PetscCall(PetscMalloc1(nrqs,&r_waits4));
  for (i=0; i<nrqs; ++i) {
    PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
    PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
  }

  /* Allocate sending buffers for a->a, and send them off */
  PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  PetscCall(PetscMalloc1(nrqr,&s_waits4));
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a,*b_a;

    PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
    PetscCall(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a));
    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row];
          nzB    = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          /* value ordering must match the column ordering used for sbuf_aj above */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
    }
    PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
    PetscCall(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a));
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc       = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen  = subc->ilen;
    imat_j     = subc->j;
    imat_i     = subc->i;
    imat_a     = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscTableFind(rmap_i,row+1,&row));
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        PetscCall(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
        /* write directly into the preallocated aij arrays of the submatrix */
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1; /* cmap stores local col + 1; undo the shift */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k]; /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        PetscCall(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE));
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no  = sbuf1_i[2*j-1];
      rmap_i = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscTableFind(rmap_i,row+1,&row));
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue; /* allcolumns rows are emitted in ascending order already */
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        PetscCall(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a));
      }
    }
  }

  PetscCall(PetscFree(r_waits4));
  PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE));
  PetscCall(PetscFree(s_waits4));

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    PetscCall(ISRestoreIndices(isrow[i],irow+i));
    if (!allcolumns[i]) {
      PetscCall(ISRestoreIndices(iscol[i],icol+i));
    }
  }

  for (i=0; i<ismax; i++) {
    PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
  }

  /* Destroy allocated memory */
  if (sbuf_aa) {
    PetscCall(PetscFree(sbuf_aa[0]));
    PetscCall(PetscFree(sbuf_aa));
  }
  PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted));

  for (i=0; i<nrqs; ++i) {
    PetscCall(PetscFree(rbuf4[i]));
  }
  PetscCall(PetscFree(rbuf4));

  PetscCall(PetscFree4(row2proc,cmap,rmap,allcolumns));
  PetscFunctionReturn(0);
}

/*
  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
  state, and needs to be "assembled" later by compressing B's column space.

  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
  Following this call, C->A & C->B have been created, even if empty.
2854 */ 2855 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2856 { 2857 /* If making this function public, change the error returned in this function away from _PLIB. */ 2858 Mat_MPIAIJ *aij; 2859 Mat_SeqAIJ *Baij; 2860 PetscBool seqaij,Bdisassembled; 2861 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2862 PetscScalar v; 2863 const PetscInt *rowindices,*colindices; 2864 2865 PetscFunctionBegin; 2866 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2867 if (A) { 2868 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij)); 2869 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2870 if (rowemb) { 2871 PetscCall(ISGetLocalSize(rowemb,&m)); 2872 PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2873 } else { 2874 if (C->rmap->n != A->rmap->n) { 2875 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2876 } 2877 } 2878 if (dcolemb) { 2879 PetscCall(ISGetLocalSize(dcolemb,&n)); 2880 PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n); 2881 } else { 2882 PetscCheck(C->cmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2883 } 2884 } 2885 if (B) { 2886 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij)); 2887 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2888 if (rowemb) { 2889 PetscCall(ISGetLocalSize(rowemb,&m)); 2890 PetscCheck(m == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2891 } else { 2892 if 
(C->rmap->n != B->rmap->n) { 2893 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2894 } 2895 } 2896 if (ocolemb) { 2897 PetscCall(ISGetLocalSize(ocolemb,&n)); 2898 PetscCheck(n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n); 2899 } else { 2900 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2901 } 2902 } 2903 2904 aij = (Mat_MPIAIJ*)C->data; 2905 if (!aij->A) { 2906 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2907 PetscCall(MatCreate(PETSC_COMM_SELF,&aij->A)); 2908 PetscCall(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n)); 2909 PetscCall(MatSetBlockSizesFromMats(aij->A,C,C)); 2910 PetscCall(MatSetType(aij->A,MATSEQAIJ)); 2911 PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A)); 2912 } 2913 if (A) { 2914 PetscCall(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A)); 2915 } else { 2916 PetscCall(MatSetUp(aij->A)); 2917 } 2918 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2919 /* 2920 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2921 need to "disassemble" B -- convert it to using C's global indices. 2922 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2923 2924 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2925 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2926 2927 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2928 At least avoid calling MatSetValues() and the implied searches? 
2929 */ 2930 2931 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2932 #if defined(PETSC_USE_CTABLE) 2933 PetscCall(PetscTableDestroy(&aij->colmap)); 2934 #else 2935 PetscCall(PetscFree(aij->colmap)); 2936 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2937 if (aij->B) PetscCall(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt))); 2938 #endif 2939 ngcol = 0; 2940 if (aij->lvec) { 2941 PetscCall(VecGetSize(aij->lvec,&ngcol)); 2942 } 2943 if (aij->garray) { 2944 PetscCall(PetscFree(aij->garray)); 2945 PetscCall(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt))); 2946 } 2947 PetscCall(VecDestroy(&aij->lvec)); 2948 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2949 } 2950 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2951 PetscCall(MatDestroy(&aij->B)); 2952 } 2953 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2954 PetscCall(MatZeroEntries(aij->B)); 2955 } 2956 } 2957 Bdisassembled = PETSC_FALSE; 2958 if (!aij->B) { 2959 PetscCall(MatCreate(PETSC_COMM_SELF,&aij->B)); 2960 PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B)); 2961 PetscCall(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N)); 2962 PetscCall(MatSetBlockSizesFromMats(aij->B,B,B)); 2963 PetscCall(MatSetType(aij->B,MATSEQAIJ)); 2964 Bdisassembled = PETSC_TRUE; 2965 } 2966 if (B) { 2967 Baij = (Mat_SeqAIJ*)B->data; 2968 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2969 PetscCall(PetscMalloc1(B->rmap->n,&nz)); 2970 for (i=0; i<B->rmap->n; i++) { 2971 nz[i] = Baij->i[i+1] - Baij->i[i]; 2972 } 2973 PetscCall(MatSeqAIJSetPreallocation(aij->B,0,nz)); 2974 PetscCall(PetscFree(nz)); 2975 } 2976 2977 PetscCall(PetscLayoutGetRange(C->rmap,&rstart,&rend)); 2978 shift = rend-rstart; 2979 count = 0; 2980 rowindices = NULL; 2981 colindices = NULL; 2982 if (rowemb) { 2983 PetscCall(ISGetIndices(rowemb,&rowindices)); 2984 } 2985 if (ocolemb) { 2986 PetscCall(ISGetIndices(ocolemb,&colindices)); 2987 } 
2988 for (i=0; i<B->rmap->n; i++) { 2989 PetscInt row; 2990 row = i; 2991 if (rowindices) row = rowindices[i]; 2992 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 2993 col = Baij->j[count]; 2994 if (colindices) col = colindices[col]; 2995 if (Bdisassembled && col>=rstart) col += shift; 2996 v = Baij->a[count]; 2997 PetscCall(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES)); 2998 ++count; 2999 } 3000 } 3001 /* No assembly for aij->B is necessary. */ 3002 /* FIXME: set aij->B's nonzerostate correctly. */ 3003 } else { 3004 PetscCall(MatSetUp(aij->B)); 3005 } 3006 C->preallocated = PETSC_TRUE; 3007 C->was_assembled = PETSC_FALSE; 3008 C->assembled = PETSC_FALSE; 3009 /* 3010 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3011 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3012 */ 3013 PetscFunctionReturn(0); 3014 } 3015 3016 /* 3017 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3018 */ 3019 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3020 { 3021 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3022 3023 PetscFunctionBegin; 3024 PetscValidPointer(A,2); 3025 PetscValidPointer(B,3); 3026 /* FIXME: make sure C is assembled */ 3027 *A = aij->A; 3028 *B = aij->B; 3029 /* Note that we don't incref *A and *B, so be careful! */ 3030 PetscFunctionReturn(0); 3031 } 3032 3033 /* 3034 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3035 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  /* NOTE(review): getlocalmats is never invoked in this routine as written; presumably it is
     reserved for the unimplemented MAT_REUSE_MATRIX path noted below -- confirm before removing. */
  PetscMPIInt size,flag;
  PetscInt    i,ii,cismax,ispar;
  Mat         *A,*B;
  IS          *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  /* Nothing requested: nothing to do. */
  if (!ismax) PetscFunctionReturn(0);

  /* cismax counts the locally-visible parallel (comm size > 1) IS pairs; each row/col pair must
     live on the identical communicator. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag));
    PetscCheck(flag == MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat));
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax,&cisrow,cismax,&ciscol));
  PetscCall(PetscMalloc1(cismax,&ciscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
    if (size > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i],&(ciscol[ii])));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
         were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
         indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    PetscCall(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i],&issize));
    PetscCall(ISGetIndices(isrow[i],&indices));
    /* Adjacent duplicates suffice as a check because the IS was just sorted. */
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i],&indices));
    PetscCall(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i],&issize));
    PetscCall(ISGetIndices(iscol[i],&indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j-1] != indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i],&indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
    if (size > 1) {
      /* Alias (no incref) the row IS as the off-diag row IS -- see the comment above. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
    and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
    will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
    with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
    will have a different order from what getsubmats_seq expects. To handle this case -- indicated
    by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
    (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
    values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
    A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscMalloc1(ismax,submat));
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A));
  PetscCall((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
       be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i));
        PetscCall(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i],MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB));
          PetscCall(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY));
        }
        /* setseqmats() consumed (copied) AA's data; the Seq piece is no longer needed. */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol+ii));
      PetscCall(ISDestroy(ciscol_p+ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        PetscCall(MatDestroy(&(*submat)[i]));
      }
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i));
        PetscCall(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A+i));
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller's submat array */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow,ciscol));
  PetscCall(PetscFree2(isrow_p,iscol_p));
  PetscCall(PetscFree(ciscol_p));
  /* A's entries were either destroyed above or handed off; only the array itself remains to free. */
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax,&B));
  PetscFunctionReturn(0);
}

/* MPIAIJ front end: dispatch to the generic driver with the AIJ-specific extraction/assembly callbacks. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}