/*
  Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

/* Helpers for the original (buffer-per-rank) overlap algorithm */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

/* Helpers for the memory-scalable overlap algorithm (PetscSF/two-sided based) */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
  MatIncreaseOverlap_MPIAIJ - Expands each of the imax index sets is[] by ov
  levels of overlap; each level is one call to MatIncreaseOverlap_MPIAIJ_Once.
  The IS objects in is[] are destroyed and replaced with the enlarged sets.
  Collective on the communicator of C.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    PetscCall(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is));
  }
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
  but uses the memory-scalable implementation (communication volume proportional
  to the rows actually exchanged rather than buffers sized by the communicator).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCheck(ov >= 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    PetscCall(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is));
  }
  PetscFunctionReturn(0);
}

/* One level of overlap expansion, memory-scalable variant (signature continues below) */
static PetscErrorCode
MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  /* Expands the nidx index sets is[] by one level of overlap.  Rows owned
     locally are expanded in place (Local_Scalable); rows owned remotely are
     shipped to their owners, expanded there (Send_Scalable), and the results
     are shipped back and merged (Receive_Scalable). */
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* get row map to determine where rows should be going */
  PetscCall(MatGetLayouts(mat,&rmap,NULL));
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  PetscCall(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length));
  for (i=0,tlength=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&length[i]));
    tlength += length[i];
    PetscCall(ISGetIndices(is[i],&indices[i]));
  }
  /* find these rows on remote processors; remember for each remote row its
     owning rank and which IS it came from */
  PetscCall(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids));
  PetscCall(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp));
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      PetscCall(PetscLayoutFindOwner(rmap,indices_i[j],&owner));
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    PetscCall(ISRestoreIndices(is[i],&indices[i]));
  }
  PetscCall(PetscFree2(*(PetscInt***)&indices,length));
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  PetscCall(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm));
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
    PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
    PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication;
     tosizes holds (size,offset) pairs, one pair per destination rank */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  PetscCall(PetscMalloc1(nto+1,&toffsets));
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* message pair <no of is, row> */
  PetscCall(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata));
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  PetscCall(PetscFree3(toranks,tosizes,tosizes_temp));
  PetscCall(PetscFree3(remoterows,rrow_ranks,rrow_isids));
  PetscCall(PetscFree(toffsets));
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  PetscCall(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata));
  PetscCall(PetscFree2(todata,fromdata));
  PetscCall(PetscFree(fromsizes));
  PetscCall(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes));
  PetscCall(PetscFree(fromranks));
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  PetscCall(PetscCalloc1(nrecvrows,&todata));
  PetscCall(PetscMalloc1(nrecvrows,&remote));
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  PetscCall(PetscSFCreate(comm,&sf));
  PetscCall(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER));
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  PetscCall(PetscSFSetType(sf,PETSCSFBASIC));
  PetscCall(PetscSFSetFromOptions(sf));
  /* overlap communication and computation: expand the locally-owned rows
     while the remotely-expanded data is in flight */
  PetscCall(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is));
  PetscCall(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE));
  PetscCall(PetscSFDestroy(&sf));
  PetscCall(PetscFree2(sbdata,sbsizes));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata));
  PetscCall(PetscFree(toranks));
PetscCall(PetscFree(tosizes));
  PetscCall(PetscFree(todata));
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merges overlap data received from
  remote owners into the local index sets.

  recvdata is a stream of nrecvs integers formatted as repeated triples
  <is id, data size, data...>; each run of 'data size' column indices is
  appended to the IS identified by 'is id'.  Each is[i] is rebuilt as a new,
  sorted, duplicate-free IS on a duplicate of its original communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;

  PetscFunctionBegin;
  max_lsize = 0;
  PetscCall(PetscMalloc1(nidx,&isz));
  for (i=0; i<nidx; i++) {
    PetscCall(ISGetLocalSize(is[i],&lsize));
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of (max_lsize+nrecvs) entries per IS: enough for the existing
     indices plus, in the worst case, every received entry */
  PetscCall(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms));
  for (i=0; i<nidx; i++) {
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
    PetscCall(ISGetIndices(is[i],&indices_i_temp));
    PetscCall(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]));
    PetscCall(ISRestoreIndices(is[i],&indices_i_temp));
    PetscCall(ISDestroy(&is[i]));
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries and rebuild each IS */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    PetscCall(PetscSortRemoveDupsInt(&isz_i,indices_i));
    PetscCall(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]));
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }
  PetscCall(PetscFree(isz));
  PetscCall(PetscFree2(indices_temp,iscomms));
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Send_Scalable - For each row requested by a remote
  rank (fromrows, formatted as <is id,row> pairs grouped per sender), collects
  the column indices of that locally-owned row (diagonal and off-diagonal
  parts) and packs reply messages of triples <is id, count, columns...>.
  On return *sbrowsizes/*sbrows hold the per-rank reply sizes/offsets and the
  packed reply data; ownership passes to the caller, which frees them.
  (signature continues below)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt
nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat???
   */
  PetscCall(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp));
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count how many distinct columns each (sender,IS) pair will
     produce, so the reply buffer can be allocated in one shot */
  for (i=0; i<nfrom; i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  PetscCall(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes));
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: use this code again, now packing the actual reply data */
  for (i=0;i<nfrom;i++) {
    PetscCall(PetscArrayzero(rows_pos_i,nidx));
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start =
ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      PetscCall(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp));
      sbsizes[2*i] += indvc_ij;
      PetscCall(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij));
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes into (size,offset) pairs expected by the caller */
  PetscCall(PetscMalloc1(nfrom+1,&offsets));
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  PetscCall(PetscFree(offsets));
  /* ownership of sbsizes/sbdata transfers to the caller */
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  PetscCall(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}

/*
  MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expands each IS by the column
  indices of its locally-owned rows (diagonal and off-diagonal parts);
  rows owned by other ranks are skipped here and handled via the send/receive
  path.  Each is[i] is rebuilt sorted and duplicate-free.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp;
  PetscMPIInt    rank,owner;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)mat,&comm));
  PetscCallMPI(MPI_Comm_rank(comm,&rank));
  PetscCall(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols));
  PetscCall(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  PetscCall(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ ");
  /* is it a safe way to
compute number of nonzero values ? */
  tnz = ai[an]+bi[bn];
  PetscCall(MatGetLayouts(mat,&rmap,&cmap));
  PetscCall(PetscLayoutGetRange(rmap,&rstart,NULL));
  PetscCall(PetscLayoutGetRange(cmap,&cstart,NULL));
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  PetscCall(PetscMalloc1(tnz,&indices_temp));
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    PetscCall(ISGetLocalSize(is[i],&lsize));
    PetscCall(ISGetIndices(is[i],&indices));
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      PetscCall(PetscLayoutFindOwner(rmap,row,&owner));
      if (owner != rank) continue; /* remote rows handled by the send/receive path */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    PetscCall(ISRestoreIndices(is[i],&indices));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL));
    PetscCall(ISDestroy(&is[i]));
    PetscCall(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp));
    PetscCall(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]));
    PetscCall(PetscCommDestroy(&iscomm));
  }
  PetscCall(PetscFree(indices_temp));
  PetscCall(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done));
  PetscCall(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done));
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] =
sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)

  MatIncreaseOverlap_MPIAIJ_Once performs one level of overlap expansion for
  the original (non-scalable) algorithm: off-processor rows are shipped to
  their owners, expanded there, and merged back; each is[i] is destroyed and
  recreated with the enlarged index set.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag1));
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));

  PetscCall(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n));

  for (i=0; i<imax; i++) {
    PetscCall(ISGetIndices(is[i],&idx[i]));
    PetscCall(ISGetLocalSize(is[i],&n[i]));
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required.
Based on this, buffers are allocated, and data copied into them */
  PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4));
  /* w1[p] = total entries destined for rank p; w3[p] = number of ISs that
     contribute to rank p; w4 is the per-IS scratch count */
  for (i=0; i<imax; i++) {
    PetscCall(PetscArrayzero(w4,size)); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  PetscCall(PetscMalloc1(nrqs,&pa));
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
  PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1));

  /* Allocate Memory for outgoing messages */
  PetscCall(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr));
  PetscCall(PetscArrayzero(outdat,size));
  PetscCall(PetscArrayzero(ptr,size));

  {
    /* carve the single buffer tmp into one contiguous slice per destination rank */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in
the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscCall(PetscArrayzero(outdat[j]+1,2*w3[j]));
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscMalloc1(imax,&table_data));
    for (i=0; i<imax; i++) {
      PetscCall(PetscTableCreate(n[i],M,&table_data[i]));
    }
    PetscCall(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; /* one M-bit table per IS */
    }
#else
    PetscInt Mimax = 0;
    PetscCall(PetscIntMultError(M,imax, &Mimax));
    PetscCall(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax));
    PetscCall(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p));
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      PetscCall(PetscArrayzero(ctr,size));
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        PetscCall(PetscLayoutFindOwner(C->rmap,row,&proc));
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /*
Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqs,&s_waits1));
  for (i=0; i<nrqs; ++i) {
    j = pa[i];
    PetscCallMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i));
  }

  /* No longer need the original indices; keep a duplicated communicator per IS
     so the rebuilt sets live on the same communicators as the originals */
  PetscCall(PetscMalloc1(imax,&iscomms));
  for (i=0; i<imax; ++i) {
    PetscCall(ISRestoreIndices(is[i],idx+i));
    PetscCall(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL));
  }
  PetscCall(PetscFree2(*(PetscInt***)&idx,n));

  for (i=0; i<imax; ++i) {
    PetscCall(ISDestroy(&is[i]));
  }

  /* Do Local work while the sends are in flight */
#if defined(PETSC_USE_CTABLE)
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data));
#else
  PetscCall(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL));
#endif

  /* Receive messages */
  PetscCall(PetscMalloc1(nrqr,&recv_status));
  PetscCallMPI(MPI_Waitall(nrqr,r_waits1,recv_status));
  PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));

  /* Phase 1 sends are complete - deallocate buffers */
  PetscCall(PetscFree4(outdat,ptr,tmp,ctr));
  PetscCall(PetscFree4(w1,w2,w3,w4));

  /* expand the requested rows on behalf of the requesting ranks */
  PetscCall(PetscMalloc1(nrqr,&xdata));
  PetscCall(PetscMalloc1(nrqr,&isz1));
  PetscCall(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1));
  PetscCall(PetscFree(rbuf[0]));
  PetscCall(PetscFree(rbuf));

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    PetscCall(PetscCalloc1(size,&rw1));

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      PetscCheck(proc ==
onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2));
    PetscCall(PetscFree(rw1));
  }
  /* Now post the Irecvs corresponding to these messages */
  PetscCall(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2));

  /* Now post the sends */
  PetscCall(PetscMalloc1(nrqr,&s_waits2));
  for (i=0; i<nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    PetscCallMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i));
  }

  /* receive work done on other processors and merge it into the local tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      PetscCallMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE));
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data begins after the <count,(size,is)...> header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { /* only add rows not already present */
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES));
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
  }

#if defined(PETSC_USE_CTABLE)
  /* size the flattening buffer by the largest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    if (tcount_max < tcount) tcount_max = tcount;
  }
PetscCall(PetscMalloc1(tcount_max+1,&tdata));
#endif

  /* rebuild each IS from the accumulated table/data */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&k,&j));
      tdata[--j] = --k; /* table stores 1-based (row+1 -> pos+1); undo the shift */
    }
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i));
#else
    PetscCall(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i));
#endif
    PetscCall(PetscCommDestroy(&iscomms[i]));
  }

  PetscCall(PetscFree(iscomms));
  PetscCall(PetscFree(onodes2));
  PetscCall(PetscFree(olengths2));

  PetscCall(PetscFree(pa));
  PetscCall(PetscFree(rbuf2[0]));
  PetscCall(PetscFree(rbuf2));
  PetscCall(PetscFree(s_waits1));
  PetscCall(PetscFree(r_waits1));
  PetscCall(PetscFree(s_waits2));
  PetscCall(PetscFree(r_waits2));
  PetscCall(PetscFree(recv_status));
  if (xdata) {
    PetscCall(PetscFree(xdata[0])); /* xdata[i] all point into the single xdata[0] chunk */
    PetscCall(PetscFree(xdata));
  }
  PetscCall(PetscFree(isz1));
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    PetscCall(PetscTableDestroy((PetscTable*)&table_data[i]));
  }
  PetscCall(PetscFree(table_data));
  PetscCall(PetscFree(tdata));
  PetscCall(PetscFree4(table,data,isz,t_p));
#else
  PetscCall(PetscFree5(table,data,isz,d_p,t_p));
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    PetscCall(PetscTableGetCount(table_data_i,&tcount));
    PetscCheck(tcount == isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    PetscCall(PetscMalloc1(tcount,&tdata));
    PetscCall(PetscTableGetHeadPosition(table_data_i,&tpos));
    while (tpos) {
      PetscCall(PetscTableGetNext(table_data_i,&tpos,&row,&j));
      tdata[--j] = --row; /* entries are stored 1-based; undo the shift */
      PetscCheck(j <= tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i  = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for every row already in the set, add the columns of that row
       (diagonal block Amat then off-diagonal block Bmat) */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if
(!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]]; /* map local off-diagonal column to global index */
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          PetscCall(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES));
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscFree(tdata));
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt   row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt   val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt   *rbuf_i,kmax,rbuf_0;
  PetscBT    xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of index sets requested; total_sz = total number of
     requested rows across all messages (rbuf_i[2*j] is the size of set j) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial size guess for the single reply buffer; grown by 1.5x below when exceeded */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  if (nrqr) {
    PetscCall(PetscMalloc1(mem_estimate,&xdata[0]));
    ++no_malloc;
  }
  PetscCall(PetscBTCreate(m,&xtable));
  PetscCall(PetscArrayzero(isz1,nrqr));

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1; /* read cursor: skip the message header */
    ct2    = ct1;        /* write cursor into xdata[i] */
    ct3   += ct1;        /* total entries written so far (across all messages) */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      PetscCall(PetscBTMemzero(m,xtable));
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single shared reply buffer and rebase the per-message pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            PetscCall(PetscMalloc1(new_estimate,&tmp));
            PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
            PetscCall(PetscFree(xdata[0]));
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand every accepted row by one level of overlap using local data */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              PetscCall(PetscMalloc1(new_estimate,&tmp));
              PetscCall(PetscArraycpy(tmp,xdata[0],mem_estimate));
              PetscCall(PetscFree(xdata[0]));
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2; /* next reply starts right after this one in the shared buffer */
    isz1[i] = ct2; /* size of each message */
  }
  PetscCall(PetscBTDestroy(&xtable));
  PetscCall(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc));
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat         B;
  Mat_MPIAIJ  *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ  *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt    sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt    m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank));
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    PetscCall(PetscMalloc1(A->rmap->N,&lens));
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    /* in-place allgather: each rank contributes its own slice of lens[] */
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    PetscCall(MatCreate(PETSC_COMM_SELF,&B));
    PetscCall(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE));
    PetscCall(MatSetBlockSizesFromMats(B,A,A));
    PetscCall(MatSetType(B,((PetscObject)a->A)->type_name));
    PetscCall(MatSeqAIJSetPreallocation(B,0,lens));
    PetscCall(PetscCalloc1(2,Bin));
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* my rows' slice of the global column-index array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge B-columns below the diagonal block, then A-columns, then B-columns
       above, so that each row's global column indices come out sorted */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A)));
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    PetscCall(PetscArraycpy(b->ilen,lens,A->rmap->N));
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    PetscCall(PetscFree(lens));
    PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    PetscCall(MatSeqAIJGetArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJGetArrayRead(a->B,&bda));
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as for the column indices above */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheck(cnt == sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* MAT_REUSE_MATRIX path: recvcounts/displs were not built above */
      PetscCall(PetscMalloc2(size,&recvcounts,size,&displs));
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
    PetscCallMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A)));
    PetscCall(MatSeqAIJRestoreArrayRead(a->A,&ada));
    PetscCall(MatSeqAIJRestoreArrayRead(a->B,&bda));
  } /* endof (flag == MAT_GET_VALUES) */
  PetscCall(PetscFree2(recvcounts,displs));

  if (A->symmetric) PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
  else if (A->hermitian) PetscCall(MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE));
  else if (A->structurally_symmetric) PetscCall(MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* Single-IS-per-process submatrix extraction; requires ismax == 1 on every rank. */
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        submat,A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt
                 *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt    ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt    *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status  *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm    comm;
  PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt *onodes1,*olengths1,idex,end;
  Mat_SubSppt *smatis1;
  PetscBool   isrowsorted,iscolsorted;

  PetscFunctionBegin;
  PetscValidLogicalCollectiveInt(C,ismax,2);
  PetscValidLogicalCollectiveEnum(C,scall,5);
  PetscCheck(ismax == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
  PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
  PetscCall(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a));
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;

  PetscCall(ISSorted(iscol[0],&iscolsorted));
  PetscCall(ISSorted(isrow[0],&isrowsorted));
  PetscCall(ISGetIndices(isrow[0],&irow));
  PetscCall(ISGetLocalSize(isrow[0],&nrow));
  if (allcolumns) {
    icol = NULL; /* all columns requested; no explicit column list needed */
    ncol = C->cmap->N;
  } else {
    PetscCall(ISGetIndices(iscol[0],&icol));
    PetscCall(ISGetLocalSize(iscol[0],&ncol));
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    PetscCall(PetscCalloc2(size,&w1,size,&w2));
    PetscCall(PetscMalloc1(nrow,&row2proc));

    /* w1[proc] = num of rows owned by proc -- to be requested */
    proc = 0;
    nrqs = 0; /* num of outgoing messages */
    for (j=0; j<nrow; j++) {
      row = irow[j];
      if (!isrowsorted) proc = 0; /* unsorted rows: restart the ownership scan each time */
      while (row >= C->rmap->range[proc+1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */

      if (proc != rank && !w2[proc]) {
        w2[proc] = 1; nrqs++;
      }
    }
    w1[rank] = 0; /* rows owned by self will not be requested */

    PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
    for (proc=0,j=0; proc<size; proc++) {
      if (w1[proc]) { pa[j++] = proc;}
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
    msz = 0; /* total mesg length (for all procs) */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      w1[proc] += 3;
      msz      += w1[proc];
    }
    PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));

    /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
    PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));

    /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

    /* Now post the Irecvs corresponding to these messages */
    PetscCall(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));

    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));

    /* Allocate Memory for outgoing messages */
    PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
    PetscCall(PetscArrayzero(sbuf1,size));
    PetscCall(PetscArrayzero(ptr,size));

    /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i=0; i<nrqs; i++) {
      proc        = pa[i];
      sbuf1[proc] = iptr;
      iptr       += w1[proc];
    }

    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      PetscCall(PetscArrayzero(sbuf1[proc],3));
      ptr[proc] = sbuf1[proc] + 3;
    }

    /* Parse the isrow and copy data into outbuf */
    PetscCall(PetscArrayzero(ctr,size));
    for (j=0; j<nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf*/
        *ptr[proc] = irow[j];
        ctr[proc]++; ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j; /* number of rows in set k */
        sbuf1_j[2*k-1] = 0;     /* IS id (always 0 here since ismax == 1) */
      }
    }

    /* Now post the sends */
    PetscCall(PetscMalloc1(nrqs,&s_waits1));
    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      PetscCallMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i));
    }

    /* Post Receives to capture the buffer size */
    PetscCall(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2));
    PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));

    /* rbuf2 reuses the tail of the tmp allocation (2*msz entries; first msz used by sbuf1) */
    if (nrqs) rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      PetscCallMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i));
    }

    PetscCall(PetscFree2(w1,w2));

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    PetscCall(PetscMalloc1(nrqr,&r_status1));
    PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));

    PetscCallMPI(MPI_Waitall(nrqr,r_waits1,r_status1));
    for (i=0; i<nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i     = rbuf1[i];
      start       = 2*rbuf1_i[0] + 1;
      PetscCallMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end));
      PetscCall(PetscMalloc1(end,&sbuf2[i]));
      sbuf2_i = sbuf2[i];
      for (j=start; j<end; j++) {
        k            = rbuf1_i[j] - rstart;
        ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
        sbuf2_i[j]   = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
    }

    PetscCall(PetscFree(r_status1));
    PetscCall(PetscFree(r_waits1));

    /* rbuf2 is received, Post recv column indices a->j */
    PetscCallMPI(MPI_Waitall(nrqs,r_waits2,r_status2));

    PetscCall(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3));
    for (i=0; i<nrqs; ++i) {
      PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i]));
      req_source2[i] = r_status2[i].MPI_SOURCE;
      PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
    }

    /* Wait on sends1 and sends2 */
    PetscCall(PetscMalloc1(nrqs,&s_status1));
    PetscCallMPI(MPI_Waitall(nrqs,s_waits1,s_status1));
    PetscCall(PetscFree(s_waits1));
    PetscCall(PetscFree(s_status1));

    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,s_status2));
    PetscCall(PetscFree4(r_status2,s_waits2,r_waits2,s_status2));

    /* Now allocate sending buffers for a->j, and send them off */
    PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0]));
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    for (i=0; i<nrqr; i++) { /* for each requested message */
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k=0; k<kmax; k++,ct1++) { /* for each row */
        row    = rbuf1_i[ct1] - rstart;
        nzA    = ai[row+1] - ai[row];
        nzB    = bi[row+1] - bi[row];
        ncols  = nzA + nzB;
        cworkA = aj + ai[row]; cworkB = bj + bi[row];

        /* load the column indices for this row into cols*/
        cols = sbuf_aj_i + ct2;

        /* merge so the global column indices come out sorted:
           B-columns left of the diagonal block, A-columns, B-columns right of it */
        lwrite = 0;
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
    }

    /* create column map (cmap): global col of C -> local col of submat */
#if defined(PETSC_USE_CTABLE)
    if (!allcolumns) {
      PetscCall(PetscTableCreate(ncol,C->cmap->N,&cmap));
      PetscCall(PetscCalloc1(C->cmap->n,&cmap_loc));
      for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
        if (icol[j] >= cstart && icol[j] <cend) {
          cmap_loc[icol[j] - cstart] = j+1; /* +1 so that 0 means "not in submat" */
        } else { /* use PetscTable for non-local col indices */
          PetscCall(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES));
        }
      }
    } else {
      cmap     = NULL;
      cmap_loc = NULL;
    }
    PetscCall(PetscCalloc1(C->rmap->n,&rmap_loc));
#else
    if (!allcolumns) {
      PetscCall(PetscCalloc1(C->cmap->N,&cmap));
      for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
    } else {
      cmap = NULL;
    }
#endif

    /* Create lens for MatSeqAIJSetPreallocation() */
    PetscCall(PetscCalloc1(nrow,&lens));

    /* Compute lens from local part of C */
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) {
        /* diagonal part A = c->A */
        ncols = ai[row-rstart+1] - ai[row-rstart];
        cols  = aj + ai[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            tcol = cmap_loc[cols[k]];
#else
            tcol = cmap[cols[k]+cstart];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] = ncols;
        }

        /* off-diagonal part B = c->B */
        ncols = bi[row-rstart+1] - bi[row-rstart];
        cols  = bj + bi[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol));
#else
            tcol = cmap[bmap[cols[k]]];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] += ncols;
        }
      }
    }

    /* Create row map (rmap): global row of C -> local row of submat */
#if defined(PETSC_USE_CTABLE)
    PetscCall(PetscTableCreate(nrow,C->rmap->N,&rmap));
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) { /* a local row */
        rmap_loc[row - rstart] = j;
      } else {
        PetscCall(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES));
      }
    }
#else
    PetscCall(PetscCalloc1(C->rmap->N,&rmap));
    for (j=0; j<nrow; j++) {
      rmap[irow[j]] = j;
    }
#endif

    /* Update lens from offproc data */
    /* recv a->j is done */
    PetscCallMPI(MPI_Waitall(nrqs,r_waits3,r_status3));
    for (i=0; i<nrqs; i++) {
      proc    = pa[i];
      sbuf1_i = sbuf1[proc];
      /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
      ct1     = 2 + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i]; /* received length of C->j */
      rbuf3_i = rbuf3[i]; /* received C->j */

      /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
      max1 = sbuf1_i[2];
      for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row));
        row--;
        PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
        row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
#endif
        /* Now, store row index of submat in sbuf1_i[ct1] */
        sbuf1_i[ct1] = row;

        nnz = rbuf2_i[ct1];
        if (!allcolumns) {
          for (l=0; l<nnz; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
              tcol = cmap_loc[rbuf3_i[ct2] - cstart];
            } else {
              PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol));
            }
#else
            tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
#endif
            if (tcol) lens[row]++;
          }
        } else { /* allcolumns */
          lens[row] += nnz;
        }
      }
    }
    PetscCallMPI(MPI_Waitall(nrqr,s_waits3,s_status3));
    PetscCall(PetscFree4(r_waits3,s_waits3,r_status3,s_status3));

    /* Create the submatrices */
    PetscCall(MatCreate(PETSC_COMM_SELF,&submat));
    PetscCall(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE));

    PetscCall(ISGetBlockSize(isrow[0],&i));
    PetscCall(ISGetBlockSize(iscol[0],&j));
PetscCall(MatSetBlockSizes(submat,i,j)); 1606 PetscCall(MatSetType(submat,((PetscObject)A)->type_name)); 1607 PetscCall(MatSeqAIJSetPreallocation(submat,0,lens)); 1608 1609 /* create struct Mat_SubSppt and attached it to submat */ 1610 PetscCall(PetscNew(&smatis1)); 1611 subc = (Mat_SeqAIJ*)submat->data; 1612 subc->submatis1 = smatis1; 1613 1614 smatis1->id = 0; 1615 smatis1->nrqs = nrqs; 1616 smatis1->nrqr = nrqr; 1617 smatis1->rbuf1 = rbuf1; 1618 smatis1->rbuf2 = rbuf2; 1619 smatis1->rbuf3 = rbuf3; 1620 smatis1->sbuf2 = sbuf2; 1621 smatis1->req_source2 = req_source2; 1622 1623 smatis1->sbuf1 = sbuf1; 1624 smatis1->ptr = ptr; 1625 smatis1->tmp = tmp; 1626 smatis1->ctr = ctr; 1627 1628 smatis1->pa = pa; 1629 smatis1->req_size = req_size; 1630 smatis1->req_source1 = req_source1; 1631 1632 smatis1->allcolumns = allcolumns; 1633 smatis1->singleis = PETSC_TRUE; 1634 smatis1->row2proc = row2proc; 1635 smatis1->rmap = rmap; 1636 smatis1->cmap = cmap; 1637 #if defined(PETSC_USE_CTABLE) 1638 smatis1->rmap_loc = rmap_loc; 1639 smatis1->cmap_loc = cmap_loc; 1640 #endif 1641 1642 smatis1->destroy = submat->ops->destroy; 1643 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1644 submat->factortype = C->factortype; 1645 1646 /* compute rmax */ 1647 rmax = 0; 1648 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1649 1650 } else { /* scall == MAT_REUSE_MATRIX */ 1651 submat = submats[0]; 1652 PetscCheck(submat->rmap->n == nrow && submat->cmap->n == ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1653 1654 subc = (Mat_SeqAIJ*)submat->data; 1655 rmax = subc->rmax; 1656 smatis1 = subc->submatis1; 1657 nrqs = smatis1->nrqs; 1658 nrqr = smatis1->nrqr; 1659 rbuf1 = smatis1->rbuf1; 1660 rbuf2 = smatis1->rbuf2; 1661 rbuf3 = smatis1->rbuf3; 1662 req_source2 = smatis1->req_source2; 1663 1664 sbuf1 = smatis1->sbuf1; 1665 sbuf2 = smatis1->sbuf2; 1666 ptr = smatis1->ptr; 1667 tmp = smatis1->tmp; 1668 ctr = smatis1->ctr; 1669 1670 pa = smatis1->pa; 1671 req_size = smatis1->req_size; 1672 req_source1 = smatis1->req_source1; 1673 1674 allcolumns = smatis1->allcolumns; 1675 row2proc = smatis1->row2proc; 1676 rmap = smatis1->rmap; 1677 cmap = smatis1->cmap; 1678 #if defined(PETSC_USE_CTABLE) 1679 rmap_loc = smatis1->rmap_loc; 1680 cmap_loc = smatis1->cmap_loc; 1681 #endif 1682 } 1683 1684 /* Post recv matrix values */ 1685 PetscCall(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals)); 1686 PetscCall(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4)); 1687 PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4)); 1688 for (i=0; i<nrqs; ++i) { 1689 PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i])); 1690 PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i)); 1691 } 1692 1693 /* Allocate sending buffers for a->a, and send them off */ 1694 PetscCall(PetscMalloc1(nrqr,&sbuf_aa)); 1695 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1696 if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0])); 1697 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1698 1699 for (i=0; i<nrqr; i++) { 1700 rbuf1_i = rbuf1[i]; 1701 sbuf_aa_i = sbuf_aa[i]; 1702 ct1 = 2*rbuf1_i[0]+1; 1703 ct2 = 0; 1704 /* max1=rbuf1_i[0]; PetscCheck(max1 == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1705 1706 kmax = rbuf1_i[2]; 1707 for (k=0; k<kmax; k++,ct1++) { 1708 row = rbuf1_i[ct1] - rstart; 1709 nzA = ai[row+1] - ai[row]; 1710 nzB = bi[row+1] - bi[row]; 1711 ncols = nzA + nzB; 1712 cworkB = bj + bi[row]; 1713 vworkA = a_a + 
ai[row]; 1714 vworkB = b_a + bi[row]; 1715 1716 /* load the column values for this row into vals*/ 1717 vals = sbuf_aa_i + ct2; 1718 1719 lwrite = 0; 1720 for (l=0; l<nzB; l++) { 1721 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1722 } 1723 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1724 for (l=0; l<nzB; l++) { 1725 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1726 } 1727 1728 ct2 += ncols; 1729 } 1730 PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i)); 1731 } 1732 1733 /* Assemble submat */ 1734 /* First assemble the local rows */ 1735 for (j=0; j<nrow; j++) { 1736 row = irow[j]; 1737 proc = row2proc[j]; 1738 if (proc == rank) { 1739 Crow = row - rstart; /* local row index of C */ 1740 #if defined(PETSC_USE_CTABLE) 1741 row = rmap_loc[Crow]; /* row index of submat */ 1742 #else 1743 row = rmap[row]; 1744 #endif 1745 1746 if (allcolumns) { 1747 /* diagonal part A = c->A */ 1748 ncols = ai[Crow+1] - ai[Crow]; 1749 cols = aj + ai[Crow]; 1750 vals = a_a + ai[Crow]; 1751 i = 0; 1752 for (k=0; k<ncols; k++) { 1753 subcols[i] = cols[k] + cstart; 1754 subvals[i++] = vals[k]; 1755 } 1756 1757 /* off-diagonal part B = c->B */ 1758 ncols = bi[Crow+1] - bi[Crow]; 1759 cols = bj + bi[Crow]; 1760 vals = b_a + bi[Crow]; 1761 for (k=0; k<ncols; k++) { 1762 subcols[i] = bmap[cols[k]]; 1763 subvals[i++] = vals[k]; 1764 } 1765 1766 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1767 1768 } else { /* !allcolumns */ 1769 #if defined(PETSC_USE_CTABLE) 1770 /* diagonal part A = c->A */ 1771 ncols = ai[Crow+1] - ai[Crow]; 1772 cols = aj + ai[Crow]; 1773 vals = a_a + ai[Crow]; 1774 i = 0; 1775 for (k=0; k<ncols; k++) { 1776 tcol = cmap_loc[cols[k]]; 1777 if (tcol) { 1778 subcols[i] = --tcol; 1779 subvals[i++] = vals[k]; 1780 } 1781 } 1782 1783 /* off-diagonal part B = c->B */ 1784 ncols = bi[Crow+1] - bi[Crow]; 1785 cols = bj + bi[Crow]; 1786 vals = b_a + bi[Crow]; 1787 for 
(k=0; k<ncols; k++) { 1788 PetscCall(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol)); 1789 if (tcol) { 1790 subcols[i] = --tcol; 1791 subvals[i++] = vals[k]; 1792 } 1793 } 1794 #else 1795 /* diagonal part A = c->A */ 1796 ncols = ai[Crow+1] - ai[Crow]; 1797 cols = aj + ai[Crow]; 1798 vals = a_a + ai[Crow]; 1799 i = 0; 1800 for (k=0; k<ncols; k++) { 1801 tcol = cmap[cols[k]+cstart]; 1802 if (tcol) { 1803 subcols[i] = --tcol; 1804 subvals[i++] = vals[k]; 1805 } 1806 } 1807 1808 /* off-diagonal part B = c->B */ 1809 ncols = bi[Crow+1] - bi[Crow]; 1810 cols = bj + bi[Crow]; 1811 vals = b_a + bi[Crow]; 1812 for (k=0; k<ncols; k++) { 1813 tcol = cmap[bmap[cols[k]]]; 1814 if (tcol) { 1815 subcols[i] = --tcol; 1816 subvals[i++] = vals[k]; 1817 } 1818 } 1819 #endif 1820 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1821 } 1822 } 1823 } 1824 1825 /* Now assemble the off-proc rows */ 1826 for (i=0; i<nrqs; i++) { /* for each requested message */ 1827 /* recv values from other processes */ 1828 PetscCallMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i)); 1829 proc = pa[idex]; 1830 sbuf1_i = sbuf1[proc]; 1831 /* jmax = sbuf1_i[0]; PetscCheck(jmax == 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1832 ct1 = 2 + 1; 1833 ct2 = 0; /* count of received C->j */ 1834 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1835 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1836 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1837 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1838 1839 /* is_no = sbuf1_i[2*j-1]; PetscCheck(is_no == 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1840 max1 = sbuf1_i[2]; /* num of rows */ 1841 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1842 row = sbuf1_i[ct1]; /* row index of submat */ 1843 if (!allcolumns) { 1844 idex = 0; 1845 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1846 nnz = rbuf2_i[ct1]; 
/* num of C entries in this row */ 1847 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1848 #if defined(PETSC_USE_CTABLE) 1849 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1850 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1851 } else { 1852 PetscCall(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol)); 1853 } 1854 #else 1855 tcol = cmap[rbuf3_i[ct2]]; 1856 #endif 1857 if (tcol) { 1858 subcols[idex] = --tcol; /* may not be sorted */ 1859 subvals[idex++] = rbuf4_i[ct2]; 1860 1861 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1862 For reuse, we replace received C->j with index that should be inserted to submat */ 1863 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1864 } 1865 } 1866 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES)); 1867 } else { /* scall == MAT_REUSE_MATRIX */ 1868 submat = submats[0]; 1869 subc = (Mat_SeqAIJ*)submat->data; 1870 1871 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1872 for (l=0; l<nnz; l++) { 1873 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1874 subvals[idex++] = rbuf4_i[ct2]; 1875 } 1876 1877 bj = subc->j + subc->i[row]; /* sorted column indices */ 1878 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES)); 1879 } 1880 } else { /* allcolumns */ 1881 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1882 PetscCall(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES)); 1883 ct2 += nnz; 1884 } 1885 } 1886 } 1887 1888 /* sending a->a are done */ 1889 PetscCallMPI(MPI_Waitall(nrqr,s_waits4,s_status4)); 1890 PetscCall(PetscFree4(r_waits4,s_waits4,r_status4,s_status4)); 1891 1892 PetscCall(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY)); 1893 PetscCall(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY)); 1894 submats[0] = submat; 1895 1896 /* Restore the indices */ 1897 PetscCall(ISRestoreIndices(isrow[0],&irow)); 1898 if (!allcolumns) { 1899 
    /* tail of MatCreateSubMatrices_MPIAIJ_SingleIS_Local(): restore IS indices,
       free the per-request receive/send buffers, and release the scalar arrays */
    PetscCall(ISRestoreIndices(iscol[0],&icol));
  }

  /* Destroy allocated memory */
  for (i=0; i<nrqs; ++i) {
    PetscCall(PetscFree(rbuf4[i]));
  }
  PetscCall(PetscFree3(rbuf4,subcols,subvals));
  if (sbuf_aa) {
    PetscCall(PetscFree(sbuf_aa[0]));
    PetscCall(PetscFree(sbuf_aa));
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscFree(lens));
    if (sbuf_aj) {
      PetscCall(PetscFree(sbuf_aj[0]));
      PetscCall(PetscFree(sbuf_aj));
    }
  }
  PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
  PetscCall(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS - extract a single submatrix per process.

  Detects the "all columns" special case (iscol is the identity of full global
  width) and forwards to MatCreateSubMatrices_MPIAIJ_SingleIS_Local(), which
  does the actual extraction.

  Note: for MAT_INITIAL_MATRIX two slots are allocated in *submat; slot 0 holds
  the submatrix, the extra slot mirrors the layout used by the staged
  (multi-IS) path. - presumably for the dummy-matrix protocol; see
  MatCreateSubMatrices_MPIAIJ() below.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscInt ncol;
  PetscBool colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscCalloc1(2,submat));
  }

  /* Check for special case: each processor gets entire matrix columns */
  PetscCall(ISIdentity(iscol[0],&colflag));
  PetscCall(ISGetLocalSize(iscol[0],&ncol));
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat));
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ - collective entry point that extracts ismax
  sequential submatrices of the parallel matrix C.

  The work is split into "stages"; each stage extracts at most nmax
  submatrices via MatCreateSubMatrices_MPIAIJ_Local().  nmax is sized so a
  stage touches roughly 20M PetscInts of column data.  Because the local
  routine is collective, every rank must execute the globally maximal number
  of stages; ranks that run out of local ISs call it with max_no = 0 and
  receive a dummy matrix that carries the stage count for later reuse.

  Special cases:
  - C->submat_singleis (set by PCSetUp_ASM()): skip the Allreduce and use the
    single-IS fast path.
  - every rank asks for the identity submatrix of a square matrix: use
    MatCreateSubMatrix_MPIAIJ_All() (toggle with -use_fast_submatrix).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ *subc;
  Mat_SubSppt *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Determine nmax = number of submatrices extracted per stage, budgeted at
     ~20M PetscInts of column data per stage */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1; /* at least one submatrix per stage */

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      PetscCall(ISIdentity(*isrow,&rowflag));
      PetscCall(ISIdentity(*iscol,&colflag));
      PetscCall(ISGetLocalSize(*isrow,&nrow));
      PetscCall(ISGetLocalSize(*iscol,&ncol));
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        PetscCall(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL));
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* single Allreduce carries both quantities: negated flag so MPI_MAX acts
       as a logical OR, and the stage count so MPI_MAX yields the global max */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    PetscCall(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      PetscCall(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat));
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* stage count saved at MAT_INITIAL_MATRIX time */
    }
  }

  if (wantallmatrix) {
    PetscCall(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat));
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscCalloc1(ismax+nstages,submat));
  }

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;       /* full stage */
    else if (pos >= ismax) max_no = 0;          /* this rank has no ISs left; participate with a dummy */
    else max_no = ismax-pos;                    /* last partial stage */

    PetscCall(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos));
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt*)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
/*
  MatCreateSubMatrices_MPIAIJ_Local - workhorse for one stage of submatrix
  extraction.  Collective: every rank calls it, possibly with ismax = 0.

  Communication protocol (four rounds of nonblocking point-to-point, each on
  its own tag obtained from C):
    tag0: each rank sends the list of off-process rows it needs, packed as
          [#IS, (is_id,count) pairs ..., row indices ...] in sbuf1;
    tag2: owners reply with the per-row column counts (sbuf2/rbuf2);
    tag3: owners send the global column indices of the requested rows
          (sbuf_aj/rbuf3);
    tag4: owners send the matrix values (sbuf_aa/rbuf4).

  For MAT_INITIAL_MATRIX all metadata buffers (sbuf1, rbuf1-3, pa, req_*,
  row2proc, rmap, cmap, ...) are stashed in a Mat_SubSppt attached to each
  submatrix so a later MAT_REUSE_MATRIX call can replay only the value
  exchange (tag4).  NOTE(review): these buffers are shared by all submatrices
  of the stage and freed by MatDestroySubMatrix_SeqAIJ - do not free here.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat A = c->A;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt *nrow,*ncol,start;
  PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3,*s_waits4;
  MPI_Comm comm;
  PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt *onodes1,*olengths1,end;
  PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt *smat_i;
  PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C,&comm));
  size = c->size;
  rank = c->rank;

  PetscCall(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns));
  PetscCall(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted));

  /* Grab the IS indices/sizes and detect per-IS special cases */
  for (i=0; i<ismax; i++) {
    PetscCall(ISSorted(iscol[i],&issorted[i]));
    if (!issorted[i]) iscsorted = issorted[i]; /* any unsorted iscol forces a final sort pass */

    PetscCall(ISSorted(isrow[i],&issorted[i])); /* issorted[] now refers to isrow */

    PetscCall(ISGetIndices(isrow[i],&irow[i]));
    PetscCall(ISGetLocalSize(isrow[i],&nrow[i]));

    /* Check for special case: allcolumn */
    PetscCall(ISIdentity(iscol[i],&colflag));
    PetscCall(ISGetLocalSize(iscol[i],&ncol[i]));
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i] = NULL; /* no column map needed; columns are kept as-is */
    } else {
      allcolumns[i] = PETSC_FALSE;
      PetscCall(ISGetIndices(iscol[i],&icol[i]));
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      PetscCheck(submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      PetscCheck(!(submats[i]->rmap->n != nrow[i]) && !(submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      PetscCall(PetscArrayzero(subc->ilen,submats[i]->rmap->n));

      /* Recover the communication metadata saved at MAT_INITIAL_MATRIX time.
         The scalars (nrqs, sbuf1, ...) are identical in every smat_i, so the
         last iteration's values are the stage-wide ones. */
      smat_i = subc->submatis1;

      nrqs = smat_i->nrqs;
      nrqr = smat_i->nrqr;
      rbuf1 = smat_i->rbuf1;
      rbuf2 = smat_i->rbuf2;
      rbuf3 = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr = smat_i->ptr;
      tmp = smat_i->tmp;
      ctr = smat_i->ctr;

      pa = smat_i->pa;
      req_size = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i] = smat_i->row2proc;
      rmap[i] = smat_i->rmap;
      cmap[i] = smat_i->cmap;
    }

    if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
      PetscCheck(submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;

      nrqs = smat_i->nrqs;
      nrqr = smat_i->nrqr;
      rbuf1 = smat_i->rbuf1;
      rbuf2 = smat_i->rbuf2;
      rbuf3 = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr = smat_i->ptr;
      tmp = smat_i->tmp;
      ctr = smat_i->ctr;

      pa = smat_i->pa;
      req_size = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag2));
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag3));

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them*/
    PetscCall(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4)); /* mesg size, initialize work vectors */

    for (i=0; i<ismax; i++) {
      jmax = nrow[i];
      irow_i = irow[i];

      PetscCall(PetscMalloc1(jmax,&row2proc_i));
      row2proc[i] = row2proc_i;

      /* ownership search: restart proc at 0 per row only when isrow is unsorted */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;              /* rows of this IS wanted from proc */
        row2proc_i[j] = proc;    /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} /* w1 = total rows, w3 = #ISs touching proc j */
      }
    }

    nrqs = 0;     /* no of outgoing messages */
    msz = 0;      /* total mesg length (for all procs) */
    w1[rank] = 0; /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    PetscCall(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz += w1[j];
    }
    PetscCall(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz));

    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr));
    PetscCall(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1));

    /* Now post the Irecvs corresponding to these messages */
    PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag0));
    PetscCall(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1));

    /* Allocate Memory for outgoing messages.
       Note: tmp is 2*msz long; the first msz entries back the sbuf1[] send
       buffers, the second msz entries back rbuf2[] (set up further below). */
    PetscCall(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr));
    PetscCall(PetscArrayzero(sbuf1,size));
    PetscCall(PetscArrayzero(ptr,size));

    {
      /* carve tmp into one send buffer per destination rank */
      PetscInt *iptr = tmp;
      k = 0;
      for (i=0; i<nrqs; i++) {
        j = pa[i];
        iptr += k;
        sbuf1[j] = iptr;
        k = w1[j];
      }
    }

    /* Form the outgoing messages.
       Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j = pa[i];
      sbuf1[j][0] = 0; /* running count of ISs packed into this message */
      PetscCall(PetscArrayzero(sbuf1[j]+1,2*w3[j]));
      ptr[j] = sbuf1[j] + 2*w3[j] + 1; /* data region starts after the header */
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      PetscCall(PetscArrayzero(ctr,size));
      irow_i = irow[i];
      jmax = nrow[i];
      for (j=0; j<jmax; j++) { /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j = sbuf1[j];
          k = ++sbuf1_j[0];
          sbuf1_j[2*k] = ctr_j; /* #rows of IS i requested from rank j */
          sbuf1_j[2*k-1] = i;   /* the IS id */
        }
      }
    }

    /* Now post the sends */
    PetscCall(PetscMalloc1(nrqs,&s_waits1));
    for (i=0; i<nrqs; ++i) {
      j = pa[i];
      PetscCallMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i));
    }

    /* Post Receives to capture the buffer size */
    PetscCall(PetscMalloc1(nrqs,&r_waits2));
    PetscCall(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3));
    if (nrqs) rbuf2[0] = tmp + msz; /* second half of tmp backs rbuf2 */
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j = pa[i];
      PetscCallMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i));
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    PetscCall(PetscMalloc1(nrqr,&s_waits2));
    PetscCall(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1));
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      PetscCallMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE));
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i = rbuf1[i];
        start = 2*rbuf1_i[0] + 1; /* skip the header */
        end = olengths1[i];
        PetscCall(PetscMalloc1(end,&sbuf2[i]));
        sbuf2_i = sbuf2[i];
        /* per-row nonzero counts = diag (A) + off-diag (B) part */
        for (j=start; j<end; j++) {
          id = rbuf1_i[j] - rstart;
          ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
          sbuf2_i[j] = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = onodes1[i];
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        PetscCallMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i));
      }
    }

    PetscCall(PetscFree(onodes1));
    PetscCall(PetscFree(olengths1));
    PetscCall(PetscFree(r_waits1));
    PetscCall(PetscFree4(w1,w2,w3,w4));

    /* Receive messages*/
    PetscCall(PetscMalloc1(nrqs,&r_waits3));
    PetscCallMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE));
    for (i=0; i<nrqs; ++i) {
      PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); /* rbuf2[i][0] = total nonzeros coming from pa[i] */
      req_source2[i] = pa[i];
      PetscCallMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i));
    }
    PetscCall(PetscFree(r_waits2));

    /* Wait on sends1 and sends2 */
    PetscCallMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE));
    PetscCallMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE));
    PetscCall(PetscFree(s_waits1));
    PetscCall(PetscFree(s_waits2));

    /* Now allocate sending buffers for a->j, and send them off */
    PetscCall(PetscMalloc1(nrqr,&sbuf_aj));
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aj[0])); /* one contiguous slab, sliced below */
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    PetscCall(PetscMalloc1(nrqr,&s_waits3));
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1 = 2*rbuf1_i[0] + 1;
        ct2 = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row = rbuf1_i[ct1] - rstart;
            nzA = a_i[row+1] - a_i[row];
            nzB = b_i[row+1] - b_i[row];
            ncols = nzA + nzB;
            cworkA = a_j + a_i[row];
            cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols */
            cols = sbuf_aj_i + ct2;

            /* emit in ascending global order: B cols left of the diag block,
               then A cols (shifted to global), then B cols right of it */
            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        PetscCallMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i));
      }
    }

    /* create col map: global col of C -> local col of submatrices */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          PetscCall(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]));

          jmax = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            PetscCall(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES)); /* 1-based so 0 can mean "absent" */
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          PetscCall(PetscCalloc1(C->cmap->N,&cmap[i]));
          jmax = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            cmap_i[icol_i[j]] = j+1; /* 1-based so 0 can mean "absent" */
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    PetscCall(PetscMalloc1(ismax,&lens));

    if (ismax) {
      PetscCall(PetscCalloc1(j,&lens[0])); /* one slab for all ISs, sliced below */
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) { /* locally owned rows only; remote ones counted later */
          PetscCall(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL));
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          PetscCall(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL));
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      PetscCall(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]));
      irow_i = irow[i];
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscCall(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES));
      }
    }
#else
    for (i=0; i<ismax; i++) {
      PetscCall(PetscCalloc1(C->rmap->N,&rmap[i]));
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      PetscCallMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE));
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax = sbuf1_i[0];
        ct1 = 2*jmax+1;
        ct2 = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no = sbuf1_i[2*j-1];
          max1 = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row));
            row--;
            PetscCheck(row >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    PetscCall(PetscFree(r_waits3));
    PetscCallMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE));
    PetscCall(PetscFree(s_waits3));

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      PetscCall(ISGetBlockSize(isrow[i],&rbs));
      PetscCall(ISGetBlockSize(iscol[i],&cbs));

      PetscCall(MatCreate(PETSC_COMM_SELF,submats+i));
      PetscCall(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE));

      PetscCall(MatSetBlockSizes(submats[i],rbs,cbs));
      PetscCall(MatSetType(submats[i],((PetscObject)A)->type_name));
      PetscCall(MatSeqAIJSetPreallocation(submats[i],0,lens[i]));

      /* create struct Mat_SubSppt and attached it to submat */
      PetscCall(PetscNew(&smat_i));
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      smat_i->destroy = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; /* frees the shared buffers below */
      submats[i]->factortype = C->factortype;

      /* stash the communication metadata for MAT_REUSE_MATRIX */
      smat_i->id = i;
      smat_i->nrqs = nrqs;
      smat_i->nrqr = nrqr;
      smat_i->rbuf1 = rbuf1;
      smat_i->rbuf2 = rbuf2;
      smat_i->rbuf3 = rbuf3;
      smat_i->sbuf2 = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr = ptr;
      smat_i->tmp = tmp;
      smat_i->ctr = ctr;

      smat_i->pa = pa;
      smat_i->req_size = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = allcolumns[i];
      smat_i->singleis = PETSC_FALSE;
      smat_i->row2proc = row2proc[i];
      smat_i->rmap = rmap[i];
      smat_i->cmap = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      PetscCall(MatCreate(PETSC_COMM_SELF,&submats[0]));
      PetscCall(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE));
      PetscCall(MatSetType(submats[0],MATDUMMY));

      /* create struct Mat_SubSppt and attached it to submat */
      PetscCall(PetscNewLog(submats[0],&smat_i));
      submats[0]->data = (void*)smat_i;

      smat_i->destroy = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
      submats[0]->factortype = C->factortype;

      smat_i->id = 0;
      smat_i->nrqs = nrqs;
      smat_i->nrqr = nrqr;
      smat_i->rbuf1 = rbuf1;
      smat_i->rbuf2 = rbuf2;
      smat_i->rbuf3 = rbuf3;
      smat_i->sbuf2 = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr = ptr;
      smat_i->tmp = tmp;
      smat_i->ctr = ctr;

      smat_i->pa = pa;
      smat_i->req_size = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis = PETSC_FALSE;
      smat_i->row2proc = NULL;
      smat_i->rmap = NULL;
      smat_i->cmap = NULL;
    }

    if (ismax) PetscCall(PetscFree(lens[0]));
    PetscCall(PetscFree(lens));
    if (sbuf_aj) {
      PetscCall(PetscFree(sbuf_aj[0]));
      PetscCall(PetscFree(sbuf_aj));
    }

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* ---- value exchange (tag4): executed for both INITIAL and REUSE ---- */

  /* Post recv matrix values */
  PetscCall(PetscObjectGetNewTag((PetscObject)C,&tag4));
  PetscCall(PetscMalloc1(nrqs,&rbuf4));
  PetscCall(PetscMalloc1(nrqs,&r_waits4));
  for (i=0; i<nrqs; ++i) {
    PetscCall(PetscMalloc1(rbuf2[i][0],&rbuf4[i]));
    PetscCallMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i));
  }

  /* Allocate sending buffers for a->a, and send them off */
  PetscCall(PetscMalloc1(nrqr,&sbuf_aa));
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  if (nrqr) PetscCall(PetscMalloc1(j,&sbuf_aa[0]));
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  PetscCall(PetscMalloc1(nrqr,&s_waits4));
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a,*b_a;

    PetscCall(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a));
    PetscCall(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a));
    for (i=0; i<nrqr; i++) {
      rbuf1_i = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1 = 2*rbuf1_i[0]+1;
      ct2 = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row = rbuf1_i[ct1] - rstart;
          nzA = a_i[row+1] - a_i[row];
          nzB = b_i[row+1] - b_i[row];
          ncols = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          /* same B-left / A / B-right order as the tag3 column-index pass,
             so values line up with the indices already sent */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      PetscCallMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i));
    }
    PetscCall(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a));
    PetscCall(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a));
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen = subc->ilen;
    imat_j = subc->j;
    imat_i = subc->i;
    imat_a = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax = nrow[i];
    for (j=0; j<jmax; j++) {
      row = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscTableFind(rmap_i,row+1,&row));
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        PetscCall(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        /* write directly into the SeqAIJ storage (preallocated from lens) */
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableFind(cmap_i,cols[k]+1,&tcol));
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1; /* undo the 1-based encoding of cmap */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k]; /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        PetscCall(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals));

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  PetscCallMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE));
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax = sbuf1_i[0];
    ct1 = 2*jmax + 1;
    ct2 = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no = sbuf1_i[2*j-1];
      rmap_i = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j = subc->j;
      imat_i = subc->i;
      imat_a = subc->a;
      max1 = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        PetscCall(PetscTableFind(rmap_i,row+1,&row));
        row--;
#else
        row = rmap_i[row];
#endif
        ilen = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2 = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            PetscCall(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol));
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc = (Mat_SeqAIJ*)submats[i]->data;
      imat_j = subc->j;
      imat_i = subc->i;
      imat_a = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue; /* already in ascending global order */
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        PetscCall(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a));
      }
    }
  }

  PetscCall(PetscFree(r_waits4));
  PetscCallMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE));
  PetscCall(PetscFree(s_waits4));

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    PetscCall(ISRestoreIndices(isrow[i],irow+i));
    if (!allcolumns[i]) {
      PetscCall(ISRestoreIndices(iscol[i],icol+i));
    }
  }

  for (i=0; i<ismax; i++) {
    PetscCall(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY));
  }

  /* Destroy allocated memory */
  if (sbuf_aa) {
    PetscCall(PetscFree(sbuf_aa[0]));
    PetscCall(PetscFree(sbuf_aa));
  }
  PetscCall(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted));

  for (i=0; i<nrqs; ++i) {
    PetscCall(PetscFree(rbuf4[i]));
  }
  PetscCall(PetscFree(rbuf4));

  PetscCall(PetscFree4(row2proc,cmap,rmap,allcolumns));
  PetscFunctionReturn(0);
}

/*
  Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
  Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
  of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
  If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
  After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
  state, and needs to be "assembled" later by compressing B's column space.

  This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
  Following this call, C->A & C->B have been created, even if empty.
2856 */ 2857 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2858 { 2859 /* If making this function public, change the error returned in this function away from _PLIB. */ 2860 Mat_MPIAIJ *aij; 2861 Mat_SeqAIJ *Baij; 2862 PetscBool seqaij,Bdisassembled; 2863 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2864 PetscScalar v; 2865 const PetscInt *rowindices,*colindices; 2866 2867 PetscFunctionBegin; 2868 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2869 if (A) { 2870 PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij)); 2871 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2872 if (rowemb) { 2873 PetscCall(ISGetLocalSize(rowemb,&m)); 2874 PetscCheck(m == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2875 } else { 2876 if (C->rmap->n != A->rmap->n) { 2877 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2878 } 2879 } 2880 if (dcolemb) { 2881 PetscCall(ISGetLocalSize(dcolemb,&n)); 2882 PetscCheck(n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n); 2883 } else { 2884 PetscCheck(C->cmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2885 } 2886 } 2887 if (B) { 2888 PetscCall(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij)); 2889 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2890 if (rowemb) { 2891 PetscCall(ISGetLocalSize(rowemb,&m)); 2892 PetscCheck(m == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2893 } else { 2894 if 
(C->rmap->n != B->rmap->n) { 2895 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2896 } 2897 } 2898 if (ocolemb) { 2899 PetscCall(ISGetLocalSize(ocolemb,&n)); 2900 PetscCheck(n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n); 2901 } else { 2902 PetscCheck(C->cmap->N - C->cmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2903 } 2904 } 2905 2906 aij = (Mat_MPIAIJ*)C->data; 2907 if (!aij->A) { 2908 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2909 PetscCall(MatCreate(PETSC_COMM_SELF,&aij->A)); 2910 PetscCall(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n)); 2911 PetscCall(MatSetBlockSizesFromMats(aij->A,C,C)); 2912 PetscCall(MatSetType(aij->A,MATSEQAIJ)); 2913 PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A)); 2914 } 2915 if (A) { 2916 PetscCall(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A)); 2917 } else { 2918 PetscCall(MatSetUp(aij->A)); 2919 } 2920 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2921 /* 2922 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2923 need to "disassemble" B -- convert it to using C's global indices. 2924 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2925 2926 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2927 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2928 2929 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2930 At least avoid calling MatSetValues() and the implied searches? 
2931 */ 2932 2933 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2934 #if defined(PETSC_USE_CTABLE) 2935 PetscCall(PetscTableDestroy(&aij->colmap)); 2936 #else 2937 PetscCall(PetscFree(aij->colmap)); 2938 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2939 if (aij->B) PetscCall(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt))); 2940 #endif 2941 ngcol = 0; 2942 if (aij->lvec) { 2943 PetscCall(VecGetSize(aij->lvec,&ngcol)); 2944 } 2945 if (aij->garray) { 2946 PetscCall(PetscFree(aij->garray)); 2947 PetscCall(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt))); 2948 } 2949 PetscCall(VecDestroy(&aij->lvec)); 2950 PetscCall(VecScatterDestroy(&aij->Mvctx)); 2951 } 2952 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2953 PetscCall(MatDestroy(&aij->B)); 2954 } 2955 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2956 PetscCall(MatZeroEntries(aij->B)); 2957 } 2958 } 2959 Bdisassembled = PETSC_FALSE; 2960 if (!aij->B) { 2961 PetscCall(MatCreate(PETSC_COMM_SELF,&aij->B)); 2962 PetscCall(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B)); 2963 PetscCall(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N)); 2964 PetscCall(MatSetBlockSizesFromMats(aij->B,B,B)); 2965 PetscCall(MatSetType(aij->B,MATSEQAIJ)); 2966 Bdisassembled = PETSC_TRUE; 2967 } 2968 if (B) { 2969 Baij = (Mat_SeqAIJ*)B->data; 2970 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2971 PetscCall(PetscMalloc1(B->rmap->n,&nz)); 2972 for (i=0; i<B->rmap->n; i++) { 2973 nz[i] = Baij->i[i+1] - Baij->i[i]; 2974 } 2975 PetscCall(MatSeqAIJSetPreallocation(aij->B,0,nz)); 2976 PetscCall(PetscFree(nz)); 2977 } 2978 2979 PetscCall(PetscLayoutGetRange(C->rmap,&rstart,&rend)); 2980 shift = rend-rstart; 2981 count = 0; 2982 rowindices = NULL; 2983 colindices = NULL; 2984 if (rowemb) { 2985 PetscCall(ISGetIndices(rowemb,&rowindices)); 2986 } 2987 if (ocolemb) { 2988 PetscCall(ISGetIndices(ocolemb,&colindices)); 2989 } 
2990 for (i=0; i<B->rmap->n; i++) { 2991 PetscInt row; 2992 row = i; 2993 if (rowindices) row = rowindices[i]; 2994 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 2995 col = Baij->j[count]; 2996 if (colindices) col = colindices[col]; 2997 if (Bdisassembled && col>=rstart) col += shift; 2998 v = Baij->a[count]; 2999 PetscCall(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES)); 3000 ++count; 3001 } 3002 } 3003 /* No assembly for aij->B is necessary. */ 3004 /* FIXME: set aij->B's nonzerostate correctly. */ 3005 } else { 3006 PetscCall(MatSetUp(aij->B)); 3007 } 3008 C->preallocated = PETSC_TRUE; 3009 C->was_assembled = PETSC_FALSE; 3010 C->assembled = PETSC_FALSE; 3011 /* 3012 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3013 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3014 */ 3015 PetscFunctionReturn(0); 3016 } 3017 3018 /* 3019 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3020 */ 3021 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3022 { 3023 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3024 3025 PetscFunctionBegin; 3026 PetscValidPointer(A,2); 3027 PetscValidPointer(B,3); 3028 /* FIXME: make sure C is assembled */ 3029 *A = aij->A; 3030 *B = aij->B; 3031 /* Note that we don't incref *A and *B, so be careful! */ 3032 PetscFunctionReturn(0); 3033 } 3034 3035 /* 3036 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3037 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  /* NOTE(review): getlocalmats is accepted but never invoked in this routine as written --
     presumably reserved for a future MAT_REUSE_MATRIX path (see comment below); confirm. */
  PetscMPIInt size,flag;
  PetscInt    i,ii,cismax,ispar;   /* cismax/ii: count of / running index over parallel (size>1) ISs */
  Mat         *A,*B;               /* sequential diag (A) and off-diag (B) pieces, from getsubmats_seq */
  IS          *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Each isrow[i]/iscol[i] pair must share a communicator; count the parallel ones. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag));
    PetscCheck(flag == MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size));
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  PetscCall(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C)));
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat));
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  PetscCall(PetscMalloc2(cismax,&cisrow,cismax,&ciscol));
  PetscCall(PetscMalloc1(cismax,&ciscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
    if (size > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      PetscCall(ISGetNonlocalIS(iscol[i],&(ciscol[ii])));
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      PetscCall(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii));
      PetscCall(ISSort(ciscol[ii]));
      ++ii;
    }
  }
  PetscCall(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p));
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
       Note: ISSort() modifies the caller's index sets in place.
    */
    PetscCall(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i));
    PetscCall(ISSort(isrow[i]));
    PetscCall(ISGetLocalSize(isrow[i],&issize));
    PetscCall(ISGetIndices(isrow[i],&indices));
    /* After sorting, duplicates (if any) are adjacent. */
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j] != indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    PetscCall(ISRestoreIndices(isrow[i],&indices));
    PetscCall(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i));
    PetscCall(ISSort(iscol[i]));
    PetscCall(ISGetLocalSize(iscol[i],&issize));
    PetscCall(ISGetIndices(iscol[i],&indices));
    for (j = 1; j < issize; ++j) {
      PetscCheck(indices[j-1] != indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    PetscCall(ISRestoreIndices(iscol[i],&indices));
    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction; ii tracks the parallel ISs only. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    PetscCall(PetscMalloc1(ismax,submat));
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  PetscCall((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A));
  PetscCall((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B));

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    PetscCallMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size));
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        PetscCall(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i));
        PetscCall(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE));
        PetscCall(MatSetType((*submat)[i],MATMPIAIJ));
        PetscCall(PetscLayoutSetUp((*submat)[i]->rmap));
        PetscCall(PetscLayoutSetUp((*submat)[i]->cmap));
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          PetscCall(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB));
          PetscCall(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY));
          PetscCall(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY));
        }
        /* AA aliases A[i]; its contents have been embedded into (*submat)[i], so drop it here.
           BB (B[ii]) is released collectively by MatDestroySubMatrices() below. */
        PetscCall(MatDestroy(&AA));
      }
      PetscCall(ISDestroy(ciscol+ii));
      PetscCall(ISDestroy(ciscol_p+ii));
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        PetscCall(MatDestroy(&(*submat)[i]));
      }
      if (isrow_p[i] || iscol_p[i]) {
        PetscCall(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i));
        PetscCall(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]));
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        PetscCall(MatDestroy(A+i));
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller via *submat */
    }
    PetscCall(ISDestroy(&isrow_p[i]));
    PetscCall(ISDestroy(&iscol_p[i]));
  }
  PetscCall(PetscFree2(cisrow,ciscol));
  PetscCall(PetscFree2(isrow_p,iscol_p));
  PetscCall(PetscFree(ciscol_p));
  /* Only the array A is freed here: its entries were either destroyed or handed to *submat above. */
  PetscCall(PetscFree(A));
  PetscCall(MatDestroySubMatrices(cismax,&B));
  PetscFunctionReturn(0);
}

/* MPIAIJ front end: dispatch MatCreateSubMatricesMPI_MPIXAIJ() with the AIJ-specific callbacks. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscFunctionBegin;
  PetscCall(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ));
  PetscFunctionReturn(0);
}