1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 22 { 23 PetscInt i; 24 25 PetscFunctionBegin; 26 PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 27 for (i=0; i<ov; ++i) { 28 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Once(C,imax,is)); 29 } 30 PetscFunctionReturn(0); 31 } 32 33 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 34 { 35 PetscInt i; 36 37 PetscFunctionBegin; 38 PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 39 for (i=0; i<ov; ++i) { 40 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is)); 41 } 42 PetscFunctionReturn(0); 43 } 44 45 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 46 { 47 MPI_Comm comm; 48 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j; 49 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 50 PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL; 51 const PetscInt *indices_i,**indices; 52 PetscLayout rmap; 53 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner; 54 PetscSF sf; 55 PetscSFNode *remote; 56 57 PetscFunctionBegin; 58 CHKERRQ(PetscObjectGetComm((PetscObject)mat,&comm)); 59 CHKERRMPI(MPI_Comm_rank(comm,&rank)); 60 CHKERRMPI(MPI_Comm_size(comm,&size)); 61 /* get row map to determine where rows should be going */ 62 CHKERRQ(MatGetLayouts(mat,&rmap,NULL)); 63 /* retrieve IS data and put all together so that we 64 * can optimize communication 65 * */ 66 CHKERRQ(PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length)); 67 for (i=0,tlength=0; i<nidx; i++) { 68 CHKERRQ(ISGetLocalSize(is[i],&length[i])); 69 tlength += length[i]; 70 CHKERRQ(ISGetIndices(is[i],&indices[i])); 71 } 72 /* find these rows on remote processors */ 73 CHKERRQ(PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids)); 74 CHKERRQ(PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp)); 75 nrrows = 0; 76 for (i=0; i<nidx; i++) { 77 length_i = length[i]; 78 indices_i = indices[i]; 79 for (j=0; j<length_i; j++) { 80 owner = -1; 81 CHKERRQ(PetscLayoutFindOwner(rmap,indices_i[j],&owner)); 82 /* remote processors */ 83 if (owner != rank) { 84 tosizes_temp[owner]++; /* number of rows to owner */ 85 rrow_ranks[nrrows] = owner; /* processor */ 86 rrow_isids[nrrows] = i; /* is id */ 87 remoterows[nrrows++] = indices_i[j]; /* row */ 88 } 89 } 90 CHKERRQ(ISRestoreIndices(is[i],&indices[i])); 91 } 92 CHKERRQ(PetscFree2(*(PetscInt***)&indices,length)); 93 /* test if we need to exchange messages 94 * generally speaking, we do not need to exchange 95 * data when overlap is 1 96 * */ 97 CHKERRMPI(MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm)); 98 /* we do not have any messages 99 * It usually corresponds to overlap 1 100 * */ 101 if (!reducednrrows) { 102 CHKERRQ(PetscFree3(toranks,tosizes,tosizes_temp)); 103 CHKERRQ(PetscFree3(remoterows,rrow_ranks,rrow_isids)); 104 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is)); 105 PetscFunctionReturn(0); 106 } 107 nto = 0; 108 /* send sizes and ranks for building a two-sided communcation */ 109 for (i=0; i<size; i++) { 110 if (tosizes_temp[i]) { 111 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 112 tosizes_temp[i] = nto; /* a map from processor to index */ 113 toranks[nto++] = i; /* processor */ 114 } 115 } 116 CHKERRQ(PetscMalloc1(nto+1,&toffsets)); 117 toffsets[0] = 0; 118 for (i=0; i<nto; i++) { 119 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 120 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 121 } 122 /* send information to other processors */ 123 CHKERRQ(PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes)); 124 nrecvrows = 0; 125 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 126 CHKERRQ(PetscMalloc1(nrecvrows,&remote)); 127 nrecvrows = 0; 128 for (i=0; i<nfrom; i++) { 129 for (j=0; j<fromsizes[2*i]; j++) { 130 remote[nrecvrows].rank = fromranks[i]; 131 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 132 } 133 } 134 CHKERRQ(PetscSFCreate(comm,&sf)); 135 CHKERRQ(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER)); 136 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 137 CHKERRQ(PetscSFSetType(sf,PETSCSFBASIC)); 138 CHKERRQ(PetscSFSetFromOptions(sf)); 139 /* message pair <no of is, row> */ 140 CHKERRQ(PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata)); 141 for (i=0; i<nrrows; i++) { 142 owner = rrow_ranks[i]; /* processor */ 143 j = tosizes_temp[owner]; /* index */ 144 todata[toffsets[j]++] = rrow_isids[i]; 145 todata[toffsets[j]++] = remoterows[i]; 146 } 147 CHKERRQ(PetscFree3(toranks,tosizes,tosizes_temp)); 148 CHKERRQ(PetscFree3(remoterows,rrow_ranks,rrow_isids)); 149 CHKERRQ(PetscFree(toffsets)); 150 CHKERRQ(PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE)); 151 CHKERRQ(PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE)); 152 CHKERRQ(PetscSFDestroy(&sf)); 153 /* send rows belonging to the remote so that then we could get the overlapping data back */ 154 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata)); 155 CHKERRQ(PetscFree2(todata,fromdata)); 156 CHKERRQ(PetscFree(fromsizes)); 157 CHKERRQ(PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes)); 158 CHKERRQ(PetscFree(fromranks)); 159 nrecvrows = 0; 160 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 161 CHKERRQ(PetscCalloc1(nrecvrows,&todata)); 162 CHKERRQ(PetscMalloc1(nrecvrows,&remote)); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) { 165 for (j=0; j<tosizes[2*i]; j++) { 166 remote[nrecvrows].rank = toranks[i]; 167 remote[nrecvrows++].index = tosizes[2*i+1]+j; 168 } 169 } 170 CHKERRQ(PetscSFCreate(comm,&sf)); 171 CHKERRQ(PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER)); 172 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 173 CHKERRQ(PetscSFSetType(sf,PETSCSFBASIC)); 174 CHKERRQ(PetscSFSetFromOptions(sf)); 175 /* overlap communication and computation */ 176 CHKERRQ(PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE)); 177 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is)); 178 CHKERRQ(PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE)); 179 CHKERRQ(PetscSFDestroy(&sf)); 180 CHKERRQ(PetscFree2(sbdata,sbsizes)); 181 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata)); 182 CHKERRQ(PetscFree(toranks)); 183 CHKERRQ(PetscFree(tosizes)); 184 CHKERRQ(PetscFree(todata)); 185 PetscFunctionReturn(0); 186 } 187 188 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 189 { 190 PetscInt *isz,isz_i,i,j,is_id, data_size; 191 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 192 const PetscInt *indices_i_temp; 193 MPI_Comm *iscomms; 194 195 PetscFunctionBegin; 196 max_lsize = 0; 197 CHKERRQ(PetscMalloc1(nidx,&isz)); 198 for (i=0; i<nidx; i++) { 199 CHKERRQ(ISGetLocalSize(is[i],&lsize)); 200 max_lsize = lsize>max_lsize ? lsize:max_lsize; 201 isz[i] = lsize; 202 } 203 CHKERRQ(PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms)); 204 for (i=0; i<nidx; i++) { 205 CHKERRQ(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL)); 206 CHKERRQ(ISGetIndices(is[i],&indices_i_temp)); 207 CHKERRQ(PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i])); 208 CHKERRQ(ISRestoreIndices(is[i],&indices_i_temp)); 209 CHKERRQ(ISDestroy(&is[i])); 210 } 211 /* retrieve information to get row id and its overlap */ 212 for (i=0; i<nrecvs;) { 213 is_id = recvdata[i++]; 214 data_size = recvdata[i++]; 215 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 216 isz_i = isz[is_id]; 217 for (j=0; j< data_size; j++) { 218 col = recvdata[i++]; 219 indices_i[isz_i++] = col; 220 } 221 isz[is_id] = isz_i; 222 } 223 /* remove duplicate entities */ 224 for (i=0; i<nidx; i++) { 225 indices_i = indices_temp+(max_lsize+nrecvs)*i; 226 isz_i = isz[i]; 227 CHKERRQ(PetscSortRemoveDupsInt(&isz_i,indices_i)); 228 CHKERRQ(ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i])); 229 CHKERRQ(PetscCommDestroy(&iscomms[i])); 230 } 231 CHKERRQ(PetscFree(isz)); 232 CHKERRQ(PetscFree2(indices_temp,iscomms)); 233 PetscFunctionReturn(0); 234 } 235 236 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 237 { 238 PetscLayout rmap,cmap; 239 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 240 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 241 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 242 const PetscInt *gcols,*ai,*aj,*bi,*bj; 243 Mat amat,bmat; 244 PetscMPIInt rank; 245 PetscBool done; 246 MPI_Comm comm; 247 248 PetscFunctionBegin; 249 CHKERRQ(PetscObjectGetComm((PetscObject)mat,&comm)); 250 CHKERRMPI(MPI_Comm_rank(comm,&rank)); 251 CHKERRQ(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols)); 252 /* Even if the mat is symmetric, we still assume it is not symmetric */ 253 CHKERRQ(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 254 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 255 CHKERRQ(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 256 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 257 /* total number of nonzero values is used to estimate the memory usage in the next step */ 258 tnz = ai[an]+bi[bn]; 259 CHKERRQ(MatGetLayouts(mat,&rmap,&cmap)); 260 CHKERRQ(PetscLayoutGetRange(rmap,&rstart,NULL)); 261 CHKERRQ(PetscLayoutGetRange(cmap,&cstart,NULL)); 262 /* to find the longest message */ 263 max_fszs = 0; 264 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 265 /* better way to estimate number of nonzero in the mat??? */ 266 CHKERRQ(PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp)); 267 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 268 rows_pos = 0; 269 totalrows = 0; 270 for (i=0; i<nfrom; i++) { 271 CHKERRQ(PetscArrayzero(rows_pos_i,nidx)); 272 /* group data together */ 273 for (j=0; j<fromsizes[2*i]; j+=2) { 274 is_id = fromrows[rows_pos++];/* no of is */ 275 rows_i = rows_data[is_id]; 276 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 277 } 278 /* estimate a space to avoid multiple allocations */ 279 for (j=0; j<nidx; j++) { 280 indvc_ij = 0; 281 rows_i = rows_data[j]; 282 for (l=0; l<rows_pos_i[j]; l++) { 283 row = rows_i[l]-rstart; 284 start = ai[row]; 285 end = ai[row+1]; 286 for (k=start; k<end; k++) { /* Amat */ 287 col = aj[k] + cstart; 288 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 289 } 290 start = bi[row]; 291 end = bi[row+1]; 292 for (k=start; k<end; k++) { /* Bmat */ 293 col = gcols[bj[k]]; 294 indices_tmp[indvc_ij++] = col; 295 } 296 } 297 CHKERRQ(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp)); 298 indv_counts[i*nidx+j] = indvc_ij; 299 totalrows += indvc_ij; 300 } 301 } 302 /* message triple <no of is, number of rows, rows> */ 303 CHKERRQ(PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes)); 304 totalrows = 0; 305 rows_pos = 0; 306 /* use this code again */ 307 for (i=0;i<nfrom;i++) { 308 CHKERRQ(PetscArrayzero(rows_pos_i,nidx)); 309 for (j=0; j<fromsizes[2*i]; j+=2) { 310 is_id = fromrows[rows_pos++]; 311 rows_i = rows_data[is_id]; 312 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 313 } 314 /* add data */ 315 for (j=0; j<nidx; j++) { 316 if (!indv_counts[i*nidx+j]) continue; 317 indvc_ij = 0; 318 sbdata[totalrows++] = j; 319 sbdata[totalrows++] = indv_counts[i*nidx+j]; 320 sbsizes[2*i] += 2; 321 rows_i = rows_data[j]; 322 for (l=0; l<rows_pos_i[j]; l++) { 323 row = rows_i[l]-rstart; 324 start = ai[row]; 325 end = ai[row+1]; 326 for (k=start; k<end; k++) { /* Amat */ 327 col = aj[k] + cstart; 328 indices_tmp[indvc_ij++] = col; 329 } 330 start = bi[row]; 331 end = bi[row+1]; 332 for (k=start; k<end; k++) { /* Bmat */ 333 col = gcols[bj[k]]; 334 indices_tmp[indvc_ij++] = col; 335 } 336 } 337 CHKERRQ(PetscSortRemoveDupsInt(&indvc_ij,indices_tmp)); 338 sbsizes[2*i] += indvc_ij; 339 CHKERRQ(PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij)); 340 totalrows += indvc_ij; 341 } 342 } 343 CHKERRQ(PetscMalloc1(nfrom+1,&offsets)); 344 offsets[0] = 0; 345 for (i=0; i<nfrom; i++) { 346 offsets[i+1] = offsets[i] + sbsizes[2*i]; 347 sbsizes[2*i+1] = offsets[i]; 348 } 349 CHKERRQ(PetscFree(offsets)); 350 if (sbrowsizes) *sbrowsizes = sbsizes; 351 if (sbrows) *sbrows = sbdata; 352 CHKERRQ(PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp)); 353 CHKERRQ(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 354 CHKERRQ(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 355 PetscFunctionReturn(0); 356 } 357 358 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 359 { 360 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 361 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 362 PetscInt lsize,lsize_tmp; 363 PetscMPIInt rank,owner; 364 Mat amat,bmat; 365 PetscBool done; 366 PetscLayout cmap,rmap; 367 MPI_Comm comm; 368 369 PetscFunctionBegin; 370 CHKERRQ(PetscObjectGetComm((PetscObject)mat,&comm)); 371 CHKERRMPI(MPI_Comm_rank(comm,&rank)); 372 CHKERRQ(MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols)); 373 CHKERRQ(MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 374 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 375 CHKERRQ(MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 376 PetscCheck(done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 377 /* is it a safe way to compute number of nonzero values ? */ 378 tnz = ai[an]+bi[bn]; 379 CHKERRQ(MatGetLayouts(mat,&rmap,&cmap)); 380 CHKERRQ(PetscLayoutGetRange(rmap,&rstart,NULL)); 381 CHKERRQ(PetscLayoutGetRange(cmap,&cstart,NULL)); 382 /* it is a better way to estimate memory than the old implementation 383 * where global size of matrix is used 384 * */ 385 CHKERRQ(PetscMalloc1(tnz,&indices_temp)); 386 for (i=0; i<nidx; i++) { 387 MPI_Comm iscomm; 388 389 CHKERRQ(ISGetLocalSize(is[i],&lsize)); 390 CHKERRQ(ISGetIndices(is[i],&indices)); 391 lsize_tmp = 0; 392 for (j=0; j<lsize; j++) { 393 owner = -1; 394 row = indices[j]; 395 CHKERRQ(PetscLayoutFindOwner(rmap,row,&owner)); 396 if (owner != rank) continue; 397 /* local number */ 398 row -= rstart; 399 start = ai[row]; 400 end = ai[row+1]; 401 for (k=start; k<end; k++) { /* Amat */ 402 col = aj[k] + cstart; 403 indices_temp[lsize_tmp++] = col; 404 } 405 start = bi[row]; 406 end = bi[row+1]; 407 for (k=start; k<end; k++) { /* Bmat */ 408 col = gcols[bj[k]]; 409 indices_temp[lsize_tmp++] = col; 410 } 411 } 412 CHKERRQ(ISRestoreIndices(is[i],&indices)); 413 CHKERRQ(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL)); 414 CHKERRQ(ISDestroy(&is[i])); 415 CHKERRQ(PetscSortRemoveDupsInt(&lsize_tmp,indices_temp)); 416 CHKERRQ(ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i])); 417 CHKERRQ(PetscCommDestroy(&iscomm)); 418 } 419 CHKERRQ(PetscFree(indices_temp)); 420 CHKERRQ(MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done)); 421 CHKERRQ(MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done)); 422 PetscFunctionReturn(0); 423 } 424 425 /* 426 Sample message format: 427 If a processor A wants processor B to process some elements corresponding 428 to index sets is[1],is[5] 429 mesg [0] = 2 (no of index sets in the mesg) 430 ----------- 431 mesg [1] = 1 => is[1] 432 mesg [2] = sizeof(is[1]); 433 ----------- 434 mesg [3] = 5 => is[5] 435 mesg [4] = sizeof(is[5]); 436 ----------- 437 mesg [5] 438 mesg [n] datas[1] 439 ----------- 440 mesg[n+1] 441 mesg[m] data(is[5]) 442 ----------- 443 444 Notes: 445 nrqs - no of requests sent (or to be sent out) 446 nrqr - no of requests received (which have to be or which have been processed) 447 */ 448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 449 { 450 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 451 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 452 const PetscInt **idx,*idx_i; 453 PetscInt *n,**data,len; 454 #if defined(PETSC_USE_CTABLE) 455 PetscTable *table_data,table_data_i; 456 PetscInt *tdata,tcount,tcount_max; 457 #else 458 PetscInt *data_i,*d_p; 459 #endif 460 PetscMPIInt size,rank,tag1,tag2,proc = 0; 461 PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 462 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 463 PetscBT *table; 464 MPI_Comm comm; 465 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 466 MPI_Status *recv_status; 467 MPI_Comm *iscomms; 468 char *t_p; 469 470 PetscFunctionBegin; 471 CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm)); 472 size = c->size; 473 rank = c->rank; 474 M = C->rmap->N; 475 476 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag1)); 477 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag2)); 478 479 CHKERRQ(PetscMalloc2(imax,(PetscInt***)&idx,imax,&n)); 480 481 for (i=0; i<imax; i++) { 482 CHKERRQ(ISGetIndices(is[i],&idx[i])); 483 CHKERRQ(ISGetLocalSize(is[i],&n[i])); 484 } 485 486 /* evaluate communication - mesg to who,length of mesg, and buffer space 487 required. Based on this, buffers are allocated, and data copied into them */ 488 CHKERRQ(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4)); 489 for (i=0; i<imax; i++) { 490 CHKERRQ(PetscArrayzero(w4,size)); /* initialise work vector*/ 491 idx_i = idx[i]; 492 len = n[i]; 493 for (j=0; j<len; j++) { 494 row = idx_i[j]; 495 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 496 CHKERRQ(PetscLayoutFindOwner(C->rmap,row,&proc)); 497 w4[proc]++; 498 } 499 for (j=0; j<size; j++) { 500 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 501 } 502 } 503 504 nrqs = 0; /* no of outgoing messages */ 505 msz = 0; /* total mesg length (for all proc */ 506 w1[rank] = 0; /* no mesg sent to intself */ 507 w3[rank] = 0; 508 for (i=0; i<size; i++) { 509 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 510 } 511 /* pa - is list of processors to communicate with */ 512 CHKERRQ(PetscMalloc1(nrqs,&pa)); 513 for (i=0,j=0; i<size; i++) { 514 if (w1[i]) {pa[j] = i; j++;} 515 } 516 517 /* Each message would have a header = 1 + 2*(no of IS) + data */ 518 for (i=0; i<nrqs; i++) { 519 j = pa[i]; 520 w1[j] += w2[j] + 2*w3[j]; 521 msz += w1[j]; 522 } 523 524 /* Determine the number of messages to expect, their lengths, from from-ids */ 525 CHKERRQ(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 526 CHKERRQ(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 527 528 /* Now post the Irecvs corresponding to these messages */ 529 CHKERRQ(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1)); 530 531 /* Allocate Memory for outgoing messages */ 532 CHKERRQ(PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr)); 533 CHKERRQ(PetscArrayzero(outdat,size)); 534 CHKERRQ(PetscArrayzero(ptr,size)); 535 536 { 537 PetscInt *iptr = tmp,ict = 0; 538 for (i=0; i<nrqs; i++) { 539 j = pa[i]; 540 iptr += ict; 541 outdat[j] = iptr; 542 ict = w1[j]; 543 } 544 } 545 546 /* Form the outgoing messages */ 547 /* plug in the headers */ 548 for (i=0; i<nrqs; i++) { 549 j = pa[i]; 550 outdat[j][0] = 0; 551 CHKERRQ(PetscArrayzero(outdat[j]+1,2*w3[j])); 552 ptr[j] = outdat[j] + 2*w3[j] + 1; 553 } 554 555 /* Memory for doing local proc's work */ 556 { 557 PetscInt M_BPB_imax = 0; 558 #if defined(PETSC_USE_CTABLE) 559 CHKERRQ(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax)); 560 CHKERRQ(PetscMalloc1(imax,&table_data)); 561 for (i=0; i<imax; i++) { 562 CHKERRQ(PetscTableCreate(n[i],M,&table_data[i])); 563 } 564 CHKERRQ(PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p)); 565 for (i=0; i<imax; i++) { 566 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 567 } 568 #else 569 PetscInt Mimax = 0; 570 CHKERRQ(PetscIntMultError(M,imax, &Mimax)); 571 CHKERRQ(PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax)); 572 CHKERRQ(PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p)); 573 for (i=0; i<imax; i++) { 574 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 575 data[i] = d_p + M*i; 576 } 577 #endif 578 } 579 580 /* Parse the IS and update local tables and the outgoing buf with the data */ 581 { 582 PetscInt n_i,isz_i,*outdat_j,ctr_j; 583 PetscBT table_i; 584 585 for (i=0; i<imax; i++) { 586 CHKERRQ(PetscArrayzero(ctr,size)); 587 n_i = n[i]; 588 table_i = table[i]; 589 idx_i = idx[i]; 590 #if defined(PETSC_USE_CTABLE) 591 table_data_i = table_data[i]; 592 #else 593 data_i = data[i]; 594 #endif 595 isz_i = isz[i]; 596 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 597 row = idx_i[j]; 598 CHKERRQ(PetscLayoutFindOwner(C->rmap,row,&proc)); 599 if (proc != rank) { /* copy to the outgoing buffer */ 600 ctr[proc]++; 601 *ptr[proc] = row; 602 ptr[proc]++; 603 } else if (!PetscBTLookupSet(table_i,row)) { 604 #if defined(PETSC_USE_CTABLE) 605 CHKERRQ(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES)); 606 #else 607 data_i[isz_i] = row; /* Update the local table */ 608 #endif 609 isz_i++; 610 } 611 } 612 /* Update the headers for the current IS */ 613 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 614 if ((ctr_j = ctr[j])) { 615 outdat_j = outdat[j]; 616 k = ++outdat_j[0]; 617 outdat_j[2*k] = ctr_j; 618 outdat_j[2*k-1] = i; 619 } 620 } 621 isz[i] = isz_i; 622 } 623 } 624 625 /* Now post the sends */ 626 CHKERRQ(PetscMalloc1(nrqs,&s_waits1)); 627 for (i=0; i<nrqs; ++i) { 628 j = pa[i]; 629 CHKERRMPI(MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i)); 630 } 631 632 /* No longer need the original indices */ 633 CHKERRQ(PetscMalloc1(imax,&iscomms)); 634 for (i=0; i<imax; ++i) { 635 CHKERRQ(ISRestoreIndices(is[i],idx+i)); 636 CHKERRQ(PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL)); 637 } 638 CHKERRQ(PetscFree2(*(PetscInt***)&idx,n)); 639 640 for (i=0; i<imax; ++i) { 641 CHKERRQ(ISDestroy(&is[i])); 642 } 643 644 /* Do Local work */ 645 #if defined(PETSC_USE_CTABLE) 646 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data)); 647 #else 648 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL)); 649 #endif 650 651 /* Receive messages */ 652 CHKERRQ(PetscMalloc1(nrqr,&recv_status)); 653 CHKERRMPI(MPI_Waitall(nrqr,r_waits1,recv_status)); 654 CHKERRMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE)); 655 656 /* Phase 1 sends are complete - deallocate buffers */ 657 CHKERRQ(PetscFree4(outdat,ptr,tmp,ctr)); 658 CHKERRQ(PetscFree4(w1,w2,w3,w4)); 659 660 CHKERRQ(PetscMalloc1(nrqr,&xdata)); 661 CHKERRQ(PetscMalloc1(nrqr,&isz1)); 662 CHKERRQ(MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1)); 663 CHKERRQ(PetscFree(rbuf[0])); 664 CHKERRQ(PetscFree(rbuf)); 665 666 /* Send the data back */ 667 /* Do a global reduction to know the buffer space req for incoming messages */ 668 { 669 PetscMPIInt *rw1; 670 671 CHKERRQ(PetscCalloc1(size,&rw1)); 672 673 for (i=0; i<nrqr; ++i) { 674 proc = recv_status[i].MPI_SOURCE; 675 676 PetscCheckFalse(proc != onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 677 rw1[proc] = isz1[i]; 678 } 679 CHKERRQ(PetscFree(onodes1)); 680 CHKERRQ(PetscFree(olengths1)); 681 682 /* Determine the number of messages to expect, their lengths, from from-ids */ 683 CHKERRQ(PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2)); 684 CHKERRQ(PetscFree(rw1)); 685 } 686 /* Now post the Irecvs corresponding to these messages */ 687 CHKERRQ(PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2)); 688 689 /* Now post the sends */ 690 CHKERRQ(PetscMalloc1(nrqr,&s_waits2)); 691 for (i=0; i<nrqr; ++i) { 692 j = recv_status[i].MPI_SOURCE; 693 CHKERRMPI(MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i)); 694 } 695 696 /* receive work done on other processors */ 697 { 698 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 699 PetscMPIInt idex; 700 PetscBT table_i; 701 702 for (i=0; i<nrqs; ++i) { 703 CHKERRMPI(MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE)); 704 /* Process the message */ 705 rbuf2_i = rbuf2[idex]; 706 ct1 = 2*rbuf2_i[0]+1; 707 jmax = rbuf2[idex][0]; 708 for (j=1; j<=jmax; j++) { 709 max = rbuf2_i[2*j]; 710 is_no = rbuf2_i[2*j-1]; 711 isz_i = isz[is_no]; 712 table_i = table[is_no]; 713 #if defined(PETSC_USE_CTABLE) 714 table_data_i = table_data[is_no]; 715 #else 716 data_i = data[is_no]; 717 #endif 718 for (k=0; k<max; k++,ct1++) { 719 row = rbuf2_i[ct1]; 720 if (!PetscBTLookupSet(table_i,row)) { 721 #if defined(PETSC_USE_CTABLE) 722 CHKERRQ(PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES)); 723 #else 724 data_i[isz_i] = row; 725 #endif 726 isz_i++; 727 } 728 } 729 isz[is_no] = isz_i; 730 } 731 } 732 733 CHKERRMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE)); 734 } 735 736 #if defined(PETSC_USE_CTABLE) 737 tcount_max = 0; 738 for (i=0; i<imax; ++i) { 739 table_data_i = table_data[i]; 740 CHKERRQ(PetscTableGetCount(table_data_i,&tcount)); 741 if (tcount_max < tcount) tcount_max = tcount; 742 } 743 CHKERRQ(PetscMalloc1(tcount_max+1,&tdata)); 744 #endif 745 746 for (i=0; i<imax; ++i) { 747 #if defined(PETSC_USE_CTABLE) 748 PetscTablePosition tpos; 749 table_data_i = table_data[i]; 750 751 CHKERRQ(PetscTableGetHeadPosition(table_data_i,&tpos)); 752 while (tpos) { 753 CHKERRQ(PetscTableGetNext(table_data_i,&tpos,&k,&j)); 754 tdata[--j] = --k; 755 } 756 CHKERRQ(ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i)); 757 #else 758 CHKERRQ(ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i)); 759 #endif 760 CHKERRQ(PetscCommDestroy(&iscomms[i])); 761 } 762 763 CHKERRQ(PetscFree(iscomms)); 764 CHKERRQ(PetscFree(onodes2)); 765 CHKERRQ(PetscFree(olengths2)); 766 767 CHKERRQ(PetscFree(pa)); 768 CHKERRQ(PetscFree(rbuf2[0])); 769 CHKERRQ(PetscFree(rbuf2)); 770 CHKERRQ(PetscFree(s_waits1)); 771 CHKERRQ(PetscFree(r_waits1)); 772 CHKERRQ(PetscFree(s_waits2)); 773 CHKERRQ(PetscFree(r_waits2)); 774 CHKERRQ(PetscFree(recv_status)); 775 if (xdata) { 776 CHKERRQ(PetscFree(xdata[0])); 777 CHKERRQ(PetscFree(xdata)); 778 } 779 CHKERRQ(PetscFree(isz1)); 780 #if defined(PETSC_USE_CTABLE) 781 for (i=0; i<imax; i++) { 782 CHKERRQ(PetscTableDestroy((PetscTable*)&table_data[i])); 783 } 784 CHKERRQ(PetscFree(table_data)); 785 CHKERRQ(PetscFree(tdata)); 786 CHKERRQ(PetscFree4(table,data,isz,t_p)); 787 #else 788 CHKERRQ(PetscFree5(table,data,isz,d_p,t_p)); 789 #endif 790 PetscFunctionReturn(0); 791 } 792 793 /* 794 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 795 the work on the local processor. 796 797 Inputs: 798 C - MAT_MPIAIJ; 799 imax - total no of index sets processed at a time; 800 table - an array of char - size = m bits. 801 802 Output: 803 isz - array containing the count of the solution elements corresponding 804 to each index set; 805 data or table_data - pointer to the solutions 806 */ 807 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 808 { 809 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 810 Mat A = c->A,B = c->B; 811 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 812 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 813 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 814 PetscBT table_i; 815 #if defined(PETSC_USE_CTABLE) 816 PetscTable table_data_i; 817 PetscTablePosition tpos; 818 PetscInt tcount,*tdata; 819 #else 820 PetscInt *data_i; 821 #endif 822 823 PetscFunctionBegin; 824 rstart = C->rmap->rstart; 825 cstart = C->cmap->rstart; 826 ai = a->i; 827 aj = a->j; 828 bi = b->i; 829 bj = b->j; 830 garray = c->garray; 831 832 for (i=0; i<imax; i++) { 833 #if defined(PETSC_USE_CTABLE) 834 /* copy existing entries of table_data_i into tdata[] */ 835 table_data_i = table_data[i]; 836 CHKERRQ(PetscTableGetCount(table_data_i,&tcount)); 837 PetscCheckFalse(tcount != isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]); 838 839 CHKERRQ(PetscMalloc1(tcount,&tdata)); 840 CHKERRQ(PetscTableGetHeadPosition(table_data_i,&tpos)); 841 while (tpos) { 842 CHKERRQ(PetscTableGetNext(table_data_i,&tpos,&row,&j)); 843 tdata[--j] = --row; 844 PetscCheckFalse(j > tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount); 845 } 846 #else 847 data_i = data[i]; 848 #endif 849 table_i = table[i]; 850 isz_i = isz[i]; 851 max = isz[i]; 852 853 for (j=0; j<max; j++) { 854 #if defined(PETSC_USE_CTABLE) 855 row = tdata[j] - rstart; 856 #else 857 row = data_i[j] - rstart; 858 #endif 859 start = ai[row]; 860 end = ai[row+1]; 861 for (k=start; k<end; k++) { /* Amat */ 862 val = aj[k] + cstart; 863 if (!PetscBTLookupSet(table_i,val)) { 864 #if defined(PETSC_USE_CTABLE) 865 CHKERRQ(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES)); 866 #else 867 data_i[isz_i] = val; 868 #endif 869 isz_i++; 870 } 871 } 872 start = bi[row]; 873 end = bi[row+1]; 874 for (k=start; k<end; k++) { /* Bmat */ 875 val = garray[bj[k]]; 876 if (!PetscBTLookupSet(table_i,val)) { 877 #if defined(PETSC_USE_CTABLE) 878 CHKERRQ(PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES)); 879 #else 880 data_i[isz_i] = val; 881 #endif 882 isz_i++; 883 } 884 } 885 } 886 isz[i] = isz_i; 887 888 #if defined(PETSC_USE_CTABLE) 889 CHKERRQ(PetscFree(tdata)); 890 #endif 891 } 892 PetscFunctionReturn(0); 893 } 894 895 /* 896 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 897 and return the output 898 899 Input: 900 C - the matrix 901 nrqr - no of messages being processed. 902 rbuf - an array of pointers to the received requests 903 904 Output: 905 xdata - array of messages to be sent back 906 isz1 - size of each message 907 908 For better efficiency perhaps we should malloc separately each xdata[i], 909 then if a remalloc is required we need only copy the data for that one row 910 rather then all previous rows as it is now where a single large chunk of 911 memory is used. 912 913 */ 914 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 915 { 916 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 917 Mat A = c->A,B = c->B; 918 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 919 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 920 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 921 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 922 PetscInt *rbuf_i,kmax,rbuf_0; 923 PetscBT xtable; 924 925 PetscFunctionBegin; 926 m = C->rmap->N; 927 rstart = C->rmap->rstart; 928 cstart = C->cmap->rstart; 929 ai = a->i; 930 aj = a->j; 931 bi = b->i; 932 bj = b->j; 933 garray = c->garray; 934 935 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 936 rbuf_i = rbuf[i]; 937 rbuf_0 = rbuf_i[0]; 938 ct += rbuf_0; 939 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 940 } 941 942 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 943 else max1 = 1; 944 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 945 if (nrqr) { 946 CHKERRQ(PetscMalloc1(mem_estimate,&xdata[0])); 947 ++no_malloc; 948 } 949 CHKERRQ(PetscBTCreate(m,&xtable)); 950 CHKERRQ(PetscArrayzero(isz1,nrqr)); 951 952 ct3 = 0; 953 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 954 rbuf_i = rbuf[i]; 955 rbuf_0 = rbuf_i[0]; 956 ct1 = 2*rbuf_0+1; 957 ct2 = ct1; 958 ct3 += ct1; 959 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 960 CHKERRQ(PetscBTMemzero(m,xtable)); 961 oct2 = ct2; 962 kmax = rbuf_i[2*j]; 963 for (k=0; k<kmax; k++,ct1++) { 964 row = rbuf_i[ct1]; 965 if (!PetscBTLookupSet(xtable,row)) { 966 if (!(ct3 < mem_estimate)) { 967 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 968 CHKERRQ(PetscMalloc1(new_estimate,&tmp)); 969 CHKERRQ(PetscArraycpy(tmp,xdata[0],mem_estimate)); 970 CHKERRQ(PetscFree(xdata[0])); 971 xdata[0] = tmp; 972 mem_estimate = new_estimate; ++no_malloc; 973 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 974 } 975 xdata[i][ct2++] = row; 976 ct3++; 977 } 978 } 979 for (k=oct2,max2=ct2; k<max2; k++) { 980 row = xdata[i][k] - rstart; 981 start = ai[row]; 982 end = ai[row+1]; 983 for (l=start; l<end; l++) { 984 val = aj[l] + cstart; 985 if (!PetscBTLookupSet(xtable,val)) { 986 if (!(ct3 < mem_estimate)) { 987 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 988 CHKERRQ(PetscMalloc1(new_estimate,&tmp)); 989 CHKERRQ(PetscArraycpy(tmp,xdata[0],mem_estimate)); 990 CHKERRQ(PetscFree(xdata[0])); 991 xdata[0] = tmp; 992 mem_estimate = new_estimate; ++no_malloc; 993 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 994 } 995 xdata[i][ct2++] = val; 996 ct3++; 997 } 998 } 999 start = bi[row]; 1000 end = bi[row+1]; 1001 for (l=start; l<end; l++) { 1002 val = garray[bj[l]]; 1003 if (!PetscBTLookupSet(xtable,val)) { 1004 if (!(ct3 < mem_estimate)) { 1005 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1006 CHKERRQ(PetscMalloc1(new_estimate,&tmp)); 1007 CHKERRQ(PetscArraycpy(tmp,xdata[0],mem_estimate)); 1008 CHKERRQ(PetscFree(xdata[0])); 1009 xdata[0] = tmp; 1010 mem_estimate = new_estimate; ++no_malloc; 1011 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1012 } 1013 xdata[i][ct2++] = val; 1014 ct3++; 1015 } 1016 } 1017 } 1018 /* Update the header*/ 1019 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1020 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1021 } 1022 xdata[i][0] = rbuf_0; 1023 if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2; 1024 isz1[i] = ct2; /* size of each message */ 1025 } 1026 CHKERRQ(PetscBTDestroy(&xtable)); 1027 CHKERRQ(PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc)); 1028 PetscFunctionReturn(0); 1029 } 1030 /* -------------------------------------------------------------------------*/ 1031 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1032 /* 1033 Every processor gets the entire matrix 1034 */ 1035 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1036 { 1037 Mat B; 1038 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1039 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1040 PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL; 1041 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1042 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1043 1044 PetscFunctionBegin; 1045 CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1046 CHKERRMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank)); 1047 if (scall == MAT_INITIAL_MATRIX) { 1048 /* ---------------------------------------------------------------- 1049 Tell every processor the number of nonzeros per row 1050 */ 1051 CHKERRQ(PetscMalloc1(A->rmap->N,&lens)); 1052 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1053 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1054 } 1055 CHKERRQ(PetscMalloc2(size,&recvcounts,size,&displs)); 1056 for (i=0; i<size; i++) { 1057 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1058 displs[i] = A->rmap->range[i]; 1059 } 1060 CHKERRMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A))); 1061 /* --------------------------------------------------------------- 1062 Create the sequential matrix of the same type as the local block diagonal 1063 */ 1064 CHKERRQ(MatCreate(PETSC_COMM_SELF,&B)); 1065 CHKERRQ(MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE)); 1066 CHKERRQ(MatSetBlockSizesFromMats(B,A,A)); 1067 CHKERRQ(MatSetType(B,((PetscObject)a->A)->type_name)); 1068 CHKERRQ(MatSeqAIJSetPreallocation(B,0,lens)); 1069 CHKERRQ(PetscCalloc1(2,Bin)); 1070 **Bin = B; 1071 b = (Mat_SeqAIJ*)B->data; 1072 1073 /*-------------------------------------------------------------------- 1074 Copy my part of matrix column indices over 1075 */ 1076 sendcount = ad->nz + bd->nz; 1077 jsendbuf = b->j + b->i[rstarts[rank]]; 1078 a_jsendbuf = ad->j; 1079 b_jsendbuf = bd->j; 1080 n = A->rmap->rend - A->rmap->rstart; 1081 cnt = 0; 1082 for (i=0; i<n; i++) { 1083 /* put in lower diagonal portion */ 1084 m = bd->i[i+1] - bd->i[i]; 1085 while (m > 0) { 1086 /* is it above diagonal (in bd (compressed) numbering) */ 1087 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1088 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1089 m--; 1090 } 1091 1092 /* put in diagonal portion */ 1093 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1094 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1095 } 1096 1097 /* put in upper diagonal portion */ 1098 while (m-- > 0) { 1099 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1100 } 1101 } 1102 PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt); 1103 1104 /*-------------------------------------------------------------------- 1105 Gather all column indices to all processors 1106 */ 1107 for (i=0; i<size; i++) { 1108 recvcounts[i] = 0; 1109 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1110 recvcounts[i] += lens[j]; 1111 } 1112 } 1113 displs[0] = 0; 1114 for (i=1; i<size; i++) { 1115 displs[i] = displs[i-1] + recvcounts[i-1]; 1116 } 1117 CHKERRMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A))); 1118 /*-------------------------------------------------------------------- 1119 Assemble the matrix into useable form (note numerical values not yet set) 1120 */ 1121 /* set the b->ilen (length of each row) values */ 1122 CHKERRQ(PetscArraycpy(b->ilen,lens,A->rmap->N)); 1123 /* set the b->i indices */ 1124 b->i[0] = 0; 1125 for (i=1; i<=A->rmap->N; i++) { 1126 b->i[i] = b->i[i-1] + lens[i-1]; 1127 } 1128 CHKERRQ(PetscFree(lens)); 1129 CHKERRQ(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY)); 1130 CHKERRQ(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY)); 1131 1132 } else { 1133 B = **Bin; 1134 b = (Mat_SeqAIJ*)B->data; 1135 } 1136 1137 /*-------------------------------------------------------------------- 1138 Copy my part of matrix numerical values into the values location 1139 */ 1140 if (flag == MAT_GET_VALUES) { 1141 const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf; 1142 MatScalar *sendbuf,*recvbuf; 1143 1144 CHKERRQ(MatSeqAIJGetArrayRead(a->A,&ada)); 1145 CHKERRQ(MatSeqAIJGetArrayRead(a->B,&bda)); 1146 sendcount = ad->nz + bd->nz; 1147 sendbuf = b->a + b->i[rstarts[rank]]; 1148 a_sendbuf = ada; 1149 b_sendbuf = bda; 1150 b_sendj = bd->j; 1151 n = A->rmap->rend - A->rmap->rstart; 1152 cnt = 0; 1153 for (i=0; i<n; i++) { 1154 /* put in lower diagonal portion */ 1155 m = bd->i[i+1] - bd->i[i]; 1156 while (m > 0) { 1157 /* is it above diagonal (in bd (compressed) numbering) */ 1158 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1159 sendbuf[cnt++] = *b_sendbuf++; 1160 m--; 1161 b_sendj++; 1162 } 1163 1164 /* put in diagonal portion */ 1165 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1166 sendbuf[cnt++] = *a_sendbuf++; 1167 } 1168 1169 /* put in upper diagonal portion */ 1170 while (m-- > 0) { 1171 sendbuf[cnt++] = *b_sendbuf++; 1172 b_sendj++; 1173 } 1174 } 1175 PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt); 1176 1177 /* ----------------------------------------------------------------- 1178 Gather all numerical values to all processors 1179 */ 1180 if (!recvcounts) { 1181 CHKERRQ(PetscMalloc2(size,&recvcounts,size,&displs)); 1182 } 1183 for (i=0; i<size; i++) { 1184 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1185 } 1186 displs[0] = 0; 1187 for (i=1; i<size; i++) { 1188 displs[i] = displs[i-1] + recvcounts[i-1]; 1189 } 1190 recvbuf = b->a; 1191 CHKERRMPI(MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A))); 1192 CHKERRQ(MatSeqAIJRestoreArrayRead(a->A,&ada)); 1193 CHKERRQ(MatSeqAIJRestoreArrayRead(a->B,&bda)); 1194 } /* endof (flag == MAT_GET_VALUES) */ 1195 CHKERRQ(PetscFree2(recvcounts,displs)); 1196 1197 if (A->symmetric) { 1198 CHKERRQ(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE)); 1199 } else if (A->hermitian) { 1200 CHKERRQ(MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE)); 1201 } else if (A->structurally_symmetric) { 1202 CHKERRQ(MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE)); 1203 } 1204 PetscFunctionReturn(0); 1205 } 1206 1207 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1208 { 1209 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1210 Mat submat,A = c->A,B = c->B; 1211 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1212 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1213 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1214 const PetscInt *icol,*irow; 1215 PetscInt nrow,ncol,start; 1216 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1217 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1218 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1219 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1220 PetscInt *lens,rmax,ncols,*cols,Crow; 1221 #if defined(PETSC_USE_CTABLE) 1222 PetscTable cmap,rmap; 1223 PetscInt *cmap_loc,*rmap_loc; 1224 #else 1225 PetscInt *cmap,*rmap; 1226 #endif 1227 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1228 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1229 PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL; 1230 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1231 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1232 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1233 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1234 MPI_Comm comm; 1235 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1236 PetscMPIInt *onodes1,*olengths1,idex,end; 1237 Mat_SubSppt *smatis1; 1238 PetscBool isrowsorted,iscolsorted; 1239 1240 PetscFunctionBegin; 1241 PetscValidLogicalCollectiveInt(C,ismax,2); 1242 PetscValidLogicalCollectiveEnum(C,scall,5); 1243 PetscCheckFalse(ismax != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1244 CHKERRQ(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a)); 1245 CHKERRQ(MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a)); 1246 CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm)); 1247 size = c->size; 1248 rank = c->rank; 1249 1250 CHKERRQ(ISSorted(iscol[0],&iscolsorted)); 1251 CHKERRQ(ISSorted(isrow[0],&isrowsorted)); 1252 CHKERRQ(ISGetIndices(isrow[0],&irow)); 1253 CHKERRQ(ISGetLocalSize(isrow[0],&nrow)); 1254 if (allcolumns) { 1255 icol = NULL; 1256 ncol = C->cmap->N; 1257 } else { 1258 CHKERRQ(ISGetIndices(iscol[0],&icol)); 1259 CHKERRQ(ISGetLocalSize(iscol[0],&ncol)); 1260 } 1261 1262 if (scall == MAT_INITIAL_MATRIX) { 1263 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1264 1265 /* Get some new tags to keep the communication clean */ 1266 tag1 = ((PetscObject)C)->tag; 1267 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag2)); 1268 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag3)); 1269 1270 /* evaluate communication - mesg to who, length of mesg, and buffer space 1271 required. Based on this, buffers are allocated, and data copied into them */ 1272 CHKERRQ(PetscCalloc2(size,&w1,size,&w2)); 1273 CHKERRQ(PetscMalloc1(nrow,&row2proc)); 1274 1275 /* w1[proc] = num of rows owned by proc -- to be requested */ 1276 proc = 0; 1277 nrqs = 0; /* num of outgoing messages */ 1278 for (j=0; j<nrow; j++) { 1279 row = irow[j]; 1280 if (!isrowsorted) proc = 0; 1281 while (row >= C->rmap->range[proc+1]) proc++; 1282 w1[proc]++; 1283 row2proc[j] = proc; /* map row index to proc */ 1284 1285 if (proc != rank && !w2[proc]) { 1286 w2[proc] = 1; nrqs++; 1287 } 1288 } 1289 w1[rank] = 0; /* rows owned by self will not be requested */ 1290 1291 CHKERRQ(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/ 1292 for (proc=0,j=0; proc<size; proc++) { 1293 if (w1[proc]) { pa[j++] = proc;} 1294 } 1295 1296 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1297 msz = 0; /* total mesg length (for all procs) */ 1298 for (i=0; i<nrqs; i++) { 1299 proc = pa[i]; 1300 w1[proc] += 3; 1301 msz += w1[proc]; 1302 } 1303 CHKERRQ(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz)); 1304 1305 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1306 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1307 CHKERRQ(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 1308 1309 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1310 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1311 CHKERRQ(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 1312 1313 /* Now post the Irecvs corresponding to these messages */ 1314 CHKERRQ(PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1)); 1315 1316 CHKERRQ(PetscFree(onodes1)); 1317 CHKERRQ(PetscFree(olengths1)); 1318 1319 /* Allocate Memory for outgoing messages */ 1320 CHKERRQ(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr)); 1321 CHKERRQ(PetscArrayzero(sbuf1,size)); 1322 CHKERRQ(PetscArrayzero(ptr,size)); 1323 1324 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1325 iptr = tmp; 1326 for (i=0; i<nrqs; i++) { 1327 proc = pa[i]; 1328 sbuf1[proc] = iptr; 1329 iptr += w1[proc]; 1330 } 1331 1332 /* Form the outgoing messages */ 1333 /* Initialize the header space */ 1334 for (i=0; i<nrqs; i++) { 1335 proc = pa[i]; 1336 CHKERRQ(PetscArrayzero(sbuf1[proc],3)); 1337 ptr[proc] = sbuf1[proc] + 3; 1338 } 1339 1340 /* Parse the isrow and copy data into outbuf */ 1341 CHKERRQ(PetscArrayzero(ctr,size)); 1342 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1343 proc = row2proc[j]; 1344 if (proc != rank) { /* copy to the outgoing buf*/ 1345 *ptr[proc] = irow[j]; 1346 ctr[proc]++; ptr[proc]++; 1347 } 1348 } 1349 1350 /* Update the headers for the current IS */ 1351 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1352 if ((ctr_j = ctr[j])) { 1353 sbuf1_j = sbuf1[j]; 1354 k = ++sbuf1_j[0]; 1355 sbuf1_j[2*k] = ctr_j; 1356 sbuf1_j[2*k-1] = 0; 1357 } 1358 } 1359 1360 /* Now post the sends */ 1361 CHKERRQ(PetscMalloc1(nrqs,&s_waits1)); 1362 for (i=0; i<nrqs; ++i) { 1363 proc = pa[i]; 1364 CHKERRMPI(MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i)); 1365 } 1366 1367 /* Post Receives to capture the buffer size */ 1368 CHKERRQ(PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2)); 1369 CHKERRQ(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3)); 1370 1371 if (nrqs) rbuf2[0] = tmp + msz; 1372 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1373 1374 for (i=0; i<nrqs; ++i) { 1375 proc = pa[i]; 1376 CHKERRMPI(MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i)); 1377 } 1378 1379 CHKERRQ(PetscFree2(w1,w2)); 1380 1381 /* Send to other procs the buf size they should allocate */ 1382 /* Receive messages*/ 1383 CHKERRQ(PetscMalloc1(nrqr,&r_status1)); 1384 CHKERRQ(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1)); 1385 1386 CHKERRMPI(MPI_Waitall(nrqr,r_waits1,r_status1)); 1387 for (i=0; i<nrqr; ++i) { 1388 req_size[i] = 0; 1389 rbuf1_i = rbuf1[i]; 1390 start = 2*rbuf1_i[0] + 1; 1391 CHKERRMPI(MPI_Get_count(r_status1+i,MPIU_INT,&end)); 1392 CHKERRQ(PetscMalloc1(end,&sbuf2[i])); 1393 sbuf2_i = sbuf2[i]; 1394 for (j=start; j<end; j++) { 1395 k = rbuf1_i[j] - rstart; 1396 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1397 sbuf2_i[j] = ncols; 1398 req_size[i] += ncols; 1399 } 1400 req_source1[i] = r_status1[i].MPI_SOURCE; 1401 1402 /* form the header */ 1403 sbuf2_i[0] = req_size[i]; 1404 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1405 1406 CHKERRMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i)); 1407 } 1408 1409 CHKERRQ(PetscFree(r_status1)); 1410 CHKERRQ(PetscFree(r_waits1)); 1411 1412 /* rbuf2 is received, Post recv column indices a->j */ 1413 CHKERRMPI(MPI_Waitall(nrqs,r_waits2,r_status2)); 1414 1415 CHKERRQ(PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3)); 1416 for (i=0; i<nrqs; ++i) { 1417 CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); 1418 req_source2[i] = r_status2[i].MPI_SOURCE; 1419 CHKERRMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i)); 1420 } 1421 1422 /* Wait on sends1 and sends2 */ 1423 CHKERRQ(PetscMalloc1(nrqs,&s_status1)); 1424 CHKERRMPI(MPI_Waitall(nrqs,s_waits1,s_status1)); 1425 CHKERRQ(PetscFree(s_waits1)); 1426 CHKERRQ(PetscFree(s_status1)); 1427 1428 CHKERRMPI(MPI_Waitall(nrqr,s_waits2,s_status2)); 1429 CHKERRQ(PetscFree4(r_status2,s_waits2,r_waits2,s_status2)); 1430 1431 /* Now allocate sending buffers for a->j, and send them off */ 1432 CHKERRQ(PetscMalloc1(nrqr,&sbuf_aj)); 1433 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1434 if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aj[0])); 1435 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1436 1437 for (i=0; i<nrqr; i++) { /* for each requested message */ 1438 rbuf1_i = rbuf1[i]; 1439 sbuf_aj_i = sbuf_aj[i]; 1440 ct1 = 2*rbuf1_i[0] + 1; 1441 ct2 = 0; 1442 /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1443 1444 kmax = rbuf1[i][2]; 1445 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1446 row = rbuf1_i[ct1] - rstart; 1447 nzA = ai[row+1] - ai[row]; 1448 nzB = bi[row+1] - bi[row]; 1449 ncols = nzA + nzB; 1450 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1451 1452 /* load the column indices for this row into cols*/ 1453 cols = sbuf_aj_i + ct2; 1454 1455 lwrite = 0; 1456 for (l=0; l<nzB; l++) { 1457 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1458 } 1459 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1460 for (l=0; l<nzB; l++) { 1461 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1462 } 1463 1464 ct2 += ncols; 1465 } 1466 CHKERRMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i)); 1467 } 1468 1469 /* create column map (cmap): global col of C -> local col of submat */ 1470 #if defined(PETSC_USE_CTABLE) 1471 if (!allcolumns) { 1472 CHKERRQ(PetscTableCreate(ncol,C->cmap->N,&cmap)); 1473 CHKERRQ(PetscCalloc1(C->cmap->n,&cmap_loc)); 1474 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1475 if (icol[j] >= cstart && icol[j] <cend) { 1476 cmap_loc[icol[j] - cstart] = j+1; 1477 } else { /* use PetscTable for non-local col indices */ 1478 CHKERRQ(PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES)); 1479 } 1480 } 1481 } else { 1482 cmap = NULL; 1483 cmap_loc = NULL; 1484 } 1485 CHKERRQ(PetscCalloc1(C->rmap->n,&rmap_loc)); 1486 #else 1487 if (!allcolumns) { 1488 CHKERRQ(PetscCalloc1(C->cmap->N,&cmap)); 1489 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1490 } else { 1491 cmap = NULL; 1492 } 1493 #endif 1494 1495 /* Create lens for MatSeqAIJSetPreallocation() */ 1496 CHKERRQ(PetscCalloc1(nrow,&lens)); 1497 1498 /* Compute lens from local part of C */ 1499 for (j=0; j<nrow; j++) { 1500 row = irow[j]; 1501 proc = row2proc[j]; 1502 if (proc == rank) { 1503 /* diagonal part A = c->A */ 1504 ncols = ai[row-rstart+1] - ai[row-rstart]; 1505 cols = aj + ai[row-rstart]; 1506 if (!allcolumns) { 1507 for (k=0; k<ncols; k++) { 1508 #if defined(PETSC_USE_CTABLE) 1509 tcol = cmap_loc[cols[k]]; 1510 #else 1511 tcol = cmap[cols[k]+cstart]; 1512 #endif 1513 if (tcol) lens[j]++; 1514 } 1515 } else { /* allcolumns */ 1516 lens[j] = ncols; 1517 } 1518 1519 /* off-diagonal part B = c->B */ 1520 ncols = bi[row-rstart+1] - bi[row-rstart]; 1521 cols = bj + bi[row-rstart]; 1522 if (!allcolumns) { 1523 for (k=0; k<ncols; k++) { 1524 #if defined(PETSC_USE_CTABLE) 1525 CHKERRQ(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol)); 1526 #else 1527 tcol = cmap[bmap[cols[k]]]; 1528 #endif 1529 if (tcol) lens[j]++; 1530 } 1531 } else { /* allcolumns */ 1532 lens[j] += ncols; 1533 } 1534 } 1535 } 1536 1537 /* Create row map (rmap): global row of C -> local row of submat */ 1538 #if defined(PETSC_USE_CTABLE) 1539 CHKERRQ(PetscTableCreate(nrow,C->rmap->N,&rmap)); 1540 for (j=0; j<nrow; j++) { 1541 row = irow[j]; 1542 proc = row2proc[j]; 1543 if (proc == rank) { /* a local row */ 1544 rmap_loc[row - rstart] = j; 1545 } else { 1546 CHKERRQ(PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES)); 1547 } 1548 } 1549 #else 1550 CHKERRQ(PetscCalloc1(C->rmap->N,&rmap)); 1551 for (j=0; j<nrow; j++) { 1552 rmap[irow[j]] = j; 1553 } 1554 #endif 1555 1556 /* Update lens from offproc data */ 1557 /* recv a->j is done */ 1558 CHKERRMPI(MPI_Waitall(nrqs,r_waits3,r_status3)); 1559 for (i=0; i<nrqs; i++) { 1560 proc = pa[i]; 1561 sbuf1_i = sbuf1[proc]; 1562 /* jmax = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1563 ct1 = 2 + 1; 1564 ct2 = 0; 1565 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1566 rbuf3_i = rbuf3[i]; /* received C->j */ 1567 1568 /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1569 max1 = sbuf1_i[2]; 1570 for (k=0; k<max1; k++,ct1++) { 1571 #if defined(PETSC_USE_CTABLE) 1572 CHKERRQ(PetscTableFind(rmap,sbuf1_i[ct1]+1,&row)); 1573 row--; 1574 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1575 #else 1576 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1577 #endif 1578 /* Now, store row index of submat in sbuf1_i[ct1] */ 1579 sbuf1_i[ct1] = row; 1580 1581 nnz = rbuf2_i[ct1]; 1582 if (!allcolumns) { 1583 for (l=0; l<nnz; l++,ct2++) { 1584 #if defined(PETSC_USE_CTABLE) 1585 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1586 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1587 } else { 1588 CHKERRQ(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol)); 1589 } 1590 #else 1591 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1592 #endif 1593 if (tcol) lens[row]++; 1594 } 1595 } else { /* allcolumns */ 1596 lens[row] += nnz; 1597 } 1598 } 1599 } 1600 CHKERRMPI(MPI_Waitall(nrqr,s_waits3,s_status3)); 1601 CHKERRQ(PetscFree4(r_waits3,s_waits3,r_status3,s_status3)); 1602 1603 /* Create the submatrices */ 1604 CHKERRQ(MatCreate(PETSC_COMM_SELF,&submat)); 1605 CHKERRQ(MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE)); 1606 1607 CHKERRQ(ISGetBlockSize(isrow[0],&i)); 1608 CHKERRQ(ISGetBlockSize(iscol[0],&j)); 1609 CHKERRQ(MatSetBlockSizes(submat,i,j)); 1610 CHKERRQ(MatSetType(submat,((PetscObject)A)->type_name)); 1611 CHKERRQ(MatSeqAIJSetPreallocation(submat,0,lens)); 1612 1613 /* create struct Mat_SubSppt and attached it to submat */ 1614 CHKERRQ(PetscNew(&smatis1)); 1615 subc = (Mat_SeqAIJ*)submat->data; 1616 subc->submatis1 = smatis1; 1617 1618 smatis1->id = 0; 1619 smatis1->nrqs = nrqs; 1620 smatis1->nrqr = nrqr; 1621 smatis1->rbuf1 = rbuf1; 1622 smatis1->rbuf2 = rbuf2; 1623 smatis1->rbuf3 = rbuf3; 1624 smatis1->sbuf2 = sbuf2; 1625 smatis1->req_source2 = req_source2; 1626 1627 smatis1->sbuf1 = sbuf1; 1628 smatis1->ptr = ptr; 1629 smatis1->tmp = tmp; 1630 smatis1->ctr = ctr; 1631 1632 smatis1->pa = pa; 1633 smatis1->req_size = req_size; 1634 smatis1->req_source1 = req_source1; 1635 1636 smatis1->allcolumns = allcolumns; 1637 smatis1->singleis = PETSC_TRUE; 1638 smatis1->row2proc = row2proc; 1639 smatis1->rmap = rmap; 1640 smatis1->cmap = cmap; 1641 #if defined(PETSC_USE_CTABLE) 1642 smatis1->rmap_loc = rmap_loc; 1643 smatis1->cmap_loc = cmap_loc; 1644 #endif 1645 1646 smatis1->destroy = submat->ops->destroy; 1647 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1648 submat->factortype = C->factortype; 1649 1650 /* compute rmax */ 1651 rmax = 0; 1652 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1653 1654 } else { /* scall == MAT_REUSE_MATRIX */ 1655 submat = submats[0]; 1656 PetscCheckFalse(submat->rmap->n != nrow || submat->cmap->n != ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1657 1658 subc = (Mat_SeqAIJ*)submat->data; 1659 rmax = subc->rmax; 1660 smatis1 = subc->submatis1; 1661 nrqs = smatis1->nrqs; 1662 nrqr = smatis1->nrqr; 1663 rbuf1 = smatis1->rbuf1; 1664 rbuf2 = smatis1->rbuf2; 1665 rbuf3 = smatis1->rbuf3; 1666 req_source2 = smatis1->req_source2; 1667 1668 sbuf1 = smatis1->sbuf1; 1669 sbuf2 = smatis1->sbuf2; 1670 ptr = smatis1->ptr; 1671 tmp = smatis1->tmp; 1672 ctr = smatis1->ctr; 1673 1674 pa = smatis1->pa; 1675 req_size = smatis1->req_size; 1676 req_source1 = smatis1->req_source1; 1677 1678 allcolumns = smatis1->allcolumns; 1679 row2proc = smatis1->row2proc; 1680 rmap = smatis1->rmap; 1681 cmap = smatis1->cmap; 1682 #if defined(PETSC_USE_CTABLE) 1683 rmap_loc = smatis1->rmap_loc; 1684 cmap_loc = smatis1->cmap_loc; 1685 #endif 1686 } 1687 1688 /* Post recv matrix values */ 1689 CHKERRQ(PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals)); 1690 CHKERRQ(PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4)); 1691 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag4)); 1692 for (i=0; i<nrqs; ++i) { 1693 CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf4[i])); 1694 CHKERRMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i)); 1695 } 1696 1697 /* Allocate sending buffers for a->a, and send them off */ 1698 CHKERRQ(PetscMalloc1(nrqr,&sbuf_aa)); 1699 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1700 if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aa[0])); 1701 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1702 1703 for (i=0; i<nrqr; i++) { 1704 rbuf1_i = rbuf1[i]; 1705 sbuf_aa_i = sbuf_aa[i]; 1706 ct1 = 2*rbuf1_i[0]+1; 1707 ct2 = 0; 1708 /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1709 1710 kmax = rbuf1_i[2]; 1711 for (k=0; k<kmax; k++,ct1++) { 1712 row = rbuf1_i[ct1] - rstart; 1713 nzA = ai[row+1] - ai[row]; 1714 nzB = bi[row+1] - bi[row]; 1715 ncols = nzA + nzB; 1716 cworkB = bj + bi[row]; 1717 vworkA = a_a + ai[row]; 1718 vworkB = b_a + bi[row]; 1719 1720 /* load the column values for this row into vals*/ 1721 vals = sbuf_aa_i + ct2; 1722 1723 lwrite = 0; 1724 for (l=0; l<nzB; l++) { 1725 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1726 } 1727 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1728 for (l=0; l<nzB; l++) { 1729 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1730 } 1731 1732 ct2 += ncols; 1733 } 1734 CHKERRMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i)); 1735 } 1736 1737 /* Assemble submat */ 1738 /* First assemble the local rows */ 1739 for (j=0; j<nrow; j++) { 1740 row = irow[j]; 1741 proc = row2proc[j]; 1742 if (proc == rank) { 1743 Crow = row - rstart; /* local row index of C */ 1744 #if defined(PETSC_USE_CTABLE) 1745 row = rmap_loc[Crow]; /* row index of submat */ 1746 #else 1747 row = rmap[row]; 1748 #endif 1749 1750 if (allcolumns) { 1751 /* diagonal part A = c->A */ 1752 ncols = ai[Crow+1] - ai[Crow]; 1753 cols = aj + ai[Crow]; 1754 vals = a_a + ai[Crow]; 1755 i = 0; 1756 for (k=0; k<ncols; k++) { 1757 subcols[i] = cols[k] + cstart; 1758 subvals[i++] = vals[k]; 1759 } 1760 1761 /* off-diagonal part B = c->B */ 1762 ncols = bi[Crow+1] - bi[Crow]; 1763 cols = bj + bi[Crow]; 1764 vals = b_a + bi[Crow]; 1765 for (k=0; k<ncols; k++) { 1766 subcols[i] = bmap[cols[k]]; 1767 subvals[i++] = vals[k]; 1768 } 1769 1770 CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1771 1772 } else { /* !allcolumns */ 1773 #if defined(PETSC_USE_CTABLE) 1774 /* diagonal part A = c->A */ 1775 ncols = ai[Crow+1] - ai[Crow]; 1776 cols = aj + ai[Crow]; 1777 vals = a_a + ai[Crow]; 1778 i = 0; 1779 for (k=0; k<ncols; k++) { 1780 tcol = cmap_loc[cols[k]]; 1781 if (tcol) { 1782 subcols[i] = --tcol; 1783 subvals[i++] = vals[k]; 1784 } 1785 } 1786 1787 /* off-diagonal part B = c->B */ 1788 ncols = bi[Crow+1] - bi[Crow]; 1789 cols = bj + bi[Crow]; 1790 vals = b_a + bi[Crow]; 1791 for (k=0; k<ncols; k++) { 1792 CHKERRQ(PetscTableFind(cmap,bmap[cols[k]]+1,&tcol)); 1793 if (tcol) { 1794 subcols[i] = --tcol; 1795 subvals[i++] = vals[k]; 1796 } 1797 } 1798 #else 1799 /* diagonal part A = c->A */ 1800 ncols = ai[Crow+1] - ai[Crow]; 1801 cols = aj + ai[Crow]; 1802 vals = a_a + ai[Crow]; 1803 i = 0; 1804 for (k=0; k<ncols; k++) { 1805 tcol = cmap[cols[k]+cstart]; 1806 if (tcol) { 1807 subcols[i] = --tcol; 1808 subvals[i++] = vals[k]; 1809 } 1810 } 1811 1812 /* off-diagonal part B = c->B */ 1813 ncols = bi[Crow+1] - bi[Crow]; 1814 cols = bj + bi[Crow]; 1815 vals = b_a + bi[Crow]; 1816 for (k=0; k<ncols; k++) { 1817 tcol = cmap[bmap[cols[k]]]; 1818 if (tcol) { 1819 subcols[i] = --tcol; 1820 subvals[i++] = vals[k]; 1821 } 1822 } 1823 #endif 1824 CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES)); 1825 } 1826 } 1827 } 1828 1829 /* Now assemble the off-proc rows */ 1830 for (i=0; i<nrqs; i++) { /* for each requested message */ 1831 /* recv values from other processes */ 1832 CHKERRMPI(MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i)); 1833 proc = pa[idex]; 1834 sbuf1_i = sbuf1[proc]; 1835 /* jmax = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1836 ct1 = 2 + 1; 1837 ct2 = 0; /* count of received C->j */ 1838 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1839 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1840 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1841 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1842 1843 /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1844 max1 = sbuf1_i[2]; /* num of rows */ 1845 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1846 row = sbuf1_i[ct1]; /* row index of submat */ 1847 if (!allcolumns) { 1848 idex = 0; 1849 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1850 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1851 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1852 #if defined(PETSC_USE_CTABLE) 1853 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1854 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1855 } else { 1856 CHKERRQ(PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol)); 1857 } 1858 #else 1859 tcol = cmap[rbuf3_i[ct2]]; 1860 #endif 1861 if (tcol) { 1862 subcols[idex] = --tcol; /* may not be sorted */ 1863 subvals[idex++] = rbuf4_i[ct2]; 1864 1865 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1866 For reuse, we replace received C->j with index that should be inserted to submat */ 1867 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1868 } 1869 } 1870 CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES)); 1871 } else { /* scall == MAT_REUSE_MATRIX */ 1872 submat = submats[0]; 1873 subc = (Mat_SeqAIJ*)submat->data; 1874 1875 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1876 for (l=0; l<nnz; l++) { 1877 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1878 subvals[idex++] = rbuf4_i[ct2]; 1879 } 1880 1881 bj = subc->j + subc->i[row]; /* sorted column indices */ 1882 CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES)); 1883 } 1884 } else { /* allcolumns */ 1885 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1886 CHKERRQ(MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES)); 1887 ct2 += nnz; 1888 } 1889 } 1890 } 1891 1892 /* sending a->a are done */ 1893 CHKERRMPI(MPI_Waitall(nrqr,s_waits4,s_status4)); 1894 CHKERRQ(PetscFree4(r_waits4,s_waits4,r_status4,s_status4)); 1895 1896 CHKERRQ(MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY)); 1897 CHKERRQ(MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY)); 1898 submats[0] = submat; 1899 1900 /* Restore the indices */ 1901 CHKERRQ(ISRestoreIndices(isrow[0],&irow)); 1902 if (!allcolumns) { 1903 CHKERRQ(ISRestoreIndices(iscol[0],&icol)); 1904 } 1905 1906 /* Destroy allocated memory */ 1907 for (i=0; i<nrqs; ++i) { 1908 CHKERRQ(PetscFree(rbuf4[i])); 1909 } 1910 CHKERRQ(PetscFree3(rbuf4,subcols,subvals)); 1911 if (sbuf_aa) { 1912 CHKERRQ(PetscFree(sbuf_aa[0])); 1913 CHKERRQ(PetscFree(sbuf_aa)); 1914 } 1915 1916 if (scall == MAT_INITIAL_MATRIX) { 1917 CHKERRQ(PetscFree(lens)); 1918 if (sbuf_aj) { 1919 CHKERRQ(PetscFree(sbuf_aj[0])); 1920 CHKERRQ(PetscFree(sbuf_aj)); 1921 } 1922 } 1923 CHKERRQ(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a)); 1924 CHKERRQ(MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a)); 1925 PetscFunctionReturn(0); 1926 } 1927 1928 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1929 { 1930 PetscInt ncol; 1931 PetscBool colflag,allcolumns=PETSC_FALSE; 1932 1933 PetscFunctionBegin; 1934 /* Allocate memory to hold all the submatrices */ 1935 if (scall == MAT_INITIAL_MATRIX) { 1936 CHKERRQ(PetscCalloc1(2,submat)); 1937 } 1938 1939 /* Check for special case: each processor gets entire matrix columns */ 1940 CHKERRQ(ISIdentity(iscol[0],&colflag)); 1941 CHKERRQ(ISGetLocalSize(iscol[0],&ncol)); 1942 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1943 1944 CHKERRQ(MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat)); 1945 PetscFunctionReturn(0); 1946 } 1947 1948 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1949 { 1950 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1951 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1952 Mat_SeqAIJ *subc; 1953 Mat_SubSppt *smat; 1954 1955 PetscFunctionBegin; 1956 /* Check for special case: each processor has a single IS */ 1957 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1958 CHKERRQ(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat)); 1959 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1960 PetscFunctionReturn(0); 1961 } 1962 1963 /* Collect global wantallmatrix and nstages */ 1964 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1965 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1966 if (!nmax) nmax = 1; 1967 1968 if (scall == MAT_INITIAL_MATRIX) { 1969 /* Collect global wantallmatrix and nstages */ 1970 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1971 CHKERRQ(ISIdentity(*isrow,&rowflag)); 1972 CHKERRQ(ISIdentity(*iscol,&colflag)); 1973 CHKERRQ(ISGetLocalSize(*isrow,&nrow)); 1974 CHKERRQ(ISGetLocalSize(*iscol,&ncol)); 1975 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1976 wantallmatrix = PETSC_TRUE; 1977 1978 CHKERRQ(PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL)); 1979 } 1980 } 1981 1982 /* Determine the number of stages through which submatrices are done 1983 Each stage will extract nmax submatrices. 1984 nmax is determined by the matrix column dimension. 1985 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1986 */ 1987 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 1988 1989 in[0] = -1*(PetscInt)wantallmatrix; 1990 in[1] = nstages; 1991 CHKERRMPI(MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C))); 1992 wantallmatrix = (PetscBool)(-out[0]); 1993 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 1994 1995 } else { /* MAT_REUSE_MATRIX */ 1996 if (ismax) { 1997 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 1998 smat = subc->submatis1; 1999 } else { /* (*submat)[0] is a dummy matrix */ 2000 smat = (Mat_SubSppt*)(*submat)[0]->data; 2001 } 2002 if (!smat) { 2003 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2004 wantallmatrix = PETSC_TRUE; 2005 } else if (smat->singleis) { 2006 CHKERRQ(MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat)); 2007 PetscFunctionReturn(0); 2008 } else { 2009 nstages = smat->nstages; 2010 } 2011 } 2012 2013 if (wantallmatrix) { 2014 CHKERRQ(MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat)); 2015 PetscFunctionReturn(0); 2016 } 2017 2018 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2019 if (scall == MAT_INITIAL_MATRIX) { 2020 CHKERRQ(PetscCalloc1(ismax+nstages,submat)); 2021 } 2022 2023 for (i=0,pos=0; i<nstages; i++) { 2024 if (pos+nmax <= ismax) max_no = nmax; 2025 else if (pos >= ismax) max_no = 0; 2026 else max_no = ismax-pos; 2027 2028 CHKERRQ(MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos)); 2029 if (!max_no) { 2030 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2031 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2032 smat->nstages = nstages; 2033 } 2034 pos++; /* advance to next dummy matrix if any */ 2035 } else pos += max_no; 2036 } 2037 2038 if (ismax && scall == MAT_INITIAL_MATRIX) { 2039 /* save nstages for reuse */ 2040 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2041 smat = subc->submatis1; 2042 smat->nstages = nstages; 2043 } 2044 PetscFunctionReturn(0); 2045 } 2046 2047 /* -------------------------------------------------------------------------*/ 2048 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2049 { 2050 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2051 Mat A = c->A; 2052 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2053 const PetscInt **icol,**irow; 2054 PetscInt *nrow,*ncol,start; 2055 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2056 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2057 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2058 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2059 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2060 #if defined(PETSC_USE_CTABLE) 2061 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2062 #else 2063 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2064 #endif 2065 const PetscInt *irow_i; 2066 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2067 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2068 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2069 MPI_Comm comm; 2070 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2071 PetscMPIInt *onodes1,*olengths1,end; 2072 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2073 Mat_SubSppt *smat_i; 2074 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2075 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2076 2077 PetscFunctionBegin; 2078 CHKERRQ(PetscObjectGetComm((PetscObject)C,&comm)); 2079 size = c->size; 2080 rank = c->rank; 2081 2082 CHKERRQ(PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns)); 2083 CHKERRQ(PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted)); 2084 2085 for (i=0; i<ismax; i++) { 2086 CHKERRQ(ISSorted(iscol[i],&issorted[i])); 2087 if (!issorted[i]) iscsorted = issorted[i]; 2088 2089 CHKERRQ(ISSorted(isrow[i],&issorted[i])); 2090 2091 CHKERRQ(ISGetIndices(isrow[i],&irow[i])); 2092 CHKERRQ(ISGetLocalSize(isrow[i],&nrow[i])); 2093 2094 /* Check for special case: allcolumn */ 2095 CHKERRQ(ISIdentity(iscol[i],&colflag)); 2096 CHKERRQ(ISGetLocalSize(iscol[i],&ncol[i])); 2097 if (colflag && ncol[i] == C->cmap->N) { 2098 allcolumns[i] = PETSC_TRUE; 2099 icol[i] = NULL; 2100 } else { 2101 allcolumns[i] = PETSC_FALSE; 2102 CHKERRQ(ISGetIndices(iscol[i],&icol[i])); 2103 } 2104 } 2105 2106 if (scall == MAT_REUSE_MATRIX) { 2107 /* Assumes new rows are same length as the old rows */ 2108 for (i=0; i<ismax; i++) { 2109 PetscCheckFalse(!submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i); 2110 subc = (Mat_SeqAIJ*)submats[i]->data; 2111 PetscCheckFalse((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2112 2113 /* Initial matrix as if empty */ 2114 CHKERRQ(PetscArrayzero(subc->ilen,submats[i]->rmap->n)); 2115 2116 smat_i = subc->submatis1; 2117 2118 nrqs = smat_i->nrqs; 2119 nrqr = smat_i->nrqr; 2120 rbuf1 = smat_i->rbuf1; 2121 rbuf2 = smat_i->rbuf2; 2122 rbuf3 = smat_i->rbuf3; 2123 req_source2 = smat_i->req_source2; 2124 2125 sbuf1 = smat_i->sbuf1; 2126 sbuf2 = smat_i->sbuf2; 2127 ptr = smat_i->ptr; 2128 tmp = smat_i->tmp; 2129 ctr = smat_i->ctr; 2130 2131 pa = smat_i->pa; 2132 req_size = smat_i->req_size; 2133 req_source1 = smat_i->req_source1; 2134 2135 allcolumns[i] = smat_i->allcolumns; 2136 row2proc[i] = smat_i->row2proc; 2137 rmap[i] = smat_i->rmap; 2138 cmap[i] = smat_i->cmap; 2139 } 2140 2141 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2142 PetscCheckFalse(!submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2143 smat_i = (Mat_SubSppt*)submats[0]->data; 2144 2145 nrqs = smat_i->nrqs; 2146 nrqr = smat_i->nrqr; 2147 rbuf1 = smat_i->rbuf1; 2148 rbuf2 = smat_i->rbuf2; 2149 rbuf3 = smat_i->rbuf3; 2150 req_source2 = smat_i->req_source2; 2151 2152 sbuf1 = smat_i->sbuf1; 2153 sbuf2 = smat_i->sbuf2; 2154 ptr = smat_i->ptr; 2155 tmp = smat_i->tmp; 2156 ctr = smat_i->ctr; 2157 2158 pa = smat_i->pa; 2159 req_size = smat_i->req_size; 2160 req_source1 = smat_i->req_source1; 2161 2162 allcolumns[0] = PETSC_FALSE; 2163 } 2164 } else { /* scall == MAT_INITIAL_MATRIX */ 2165 /* Get some new tags to keep the communication clean */ 2166 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag2)); 2167 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag3)); 2168 2169 /* evaluate communication - mesg to who, length of mesg, and buffer space 2170 required. Based on this, buffers are allocated, and data copied into them*/ 2171 CHKERRQ(PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4)); /* mesg size, initialize work vectors */ 2172 2173 for (i=0; i<ismax; i++) { 2174 jmax = nrow[i]; 2175 irow_i = irow[i]; 2176 2177 CHKERRQ(PetscMalloc1(jmax,&row2proc_i)); 2178 row2proc[i] = row2proc_i; 2179 2180 if (issorted[i]) proc = 0; 2181 for (j=0; j<jmax; j++) { 2182 if (!issorted[i]) proc = 0; 2183 row = irow_i[j]; 2184 while (row >= C->rmap->range[proc+1]) proc++; 2185 w4[proc]++; 2186 row2proc_i[j] = proc; /* map row index to proc */ 2187 } 2188 for (j=0; j<size; j++) { 2189 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2190 } 2191 } 2192 2193 nrqs = 0; /* no of outgoing messages */ 2194 msz = 0; /* total mesg length (for all procs) */ 2195 w1[rank] = 0; /* no mesg sent to self */ 2196 w3[rank] = 0; 2197 for (i=0; i<size; i++) { 2198 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2199 } 2200 CHKERRQ(PetscMalloc1(nrqs,&pa)); /*(proc -array)*/ 2201 for (i=0,j=0; i<size; i++) { 2202 if (w1[i]) { pa[j] = i; j++; } 2203 } 2204 2205 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2206 for (i=0; i<nrqs; i++) { 2207 j = pa[i]; 2208 w1[j] += w2[j] + 2* w3[j]; 2209 msz += w1[j]; 2210 } 2211 CHKERRQ(PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz)); 2212 2213 /* Determine the number of messages to expect, their lengths, from from-ids */ 2214 CHKERRQ(PetscGatherNumberOfMessages(comm,w2,w1,&nrqr)); 2215 CHKERRQ(PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1)); 2216 2217 /* Now post the Irecvs corresponding to these messages */ 2218 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag0)); 2219 CHKERRQ(PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1)); 2220 2221 /* Allocate Memory for outgoing messages */ 2222 CHKERRQ(PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr)); 2223 CHKERRQ(PetscArrayzero(sbuf1,size)); 2224 CHKERRQ(PetscArrayzero(ptr,size)); 2225 2226 { 2227 PetscInt *iptr = tmp; 2228 k = 0; 2229 for (i=0; i<nrqs; i++) { 2230 j = pa[i]; 2231 iptr += k; 2232 sbuf1[j] = iptr; 2233 k = w1[j]; 2234 } 2235 } 2236 2237 /* Form the outgoing messages. Initialize the header space */ 2238 for (i=0; i<nrqs; i++) { 2239 j = pa[i]; 2240 sbuf1[j][0] = 0; 2241 CHKERRQ(PetscArrayzero(sbuf1[j]+1,2*w3[j])); 2242 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2243 } 2244 2245 /* Parse the isrow and copy data into outbuf */ 2246 for (i=0; i<ismax; i++) { 2247 row2proc_i = row2proc[i]; 2248 CHKERRQ(PetscArrayzero(ctr,size)); 2249 irow_i = irow[i]; 2250 jmax = nrow[i]; 2251 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2252 proc = row2proc_i[j]; 2253 if (proc != rank) { /* copy to the outgoing buf*/ 2254 ctr[proc]++; 2255 *ptr[proc] = irow_i[j]; 2256 ptr[proc]++; 2257 } 2258 } 2259 /* Update the headers for the current IS */ 2260 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2261 if ((ctr_j = ctr[j])) { 2262 sbuf1_j = sbuf1[j]; 2263 k = ++sbuf1_j[0]; 2264 sbuf1_j[2*k] = ctr_j; 2265 sbuf1_j[2*k-1] = i; 2266 } 2267 } 2268 } 2269 2270 /* Now post the sends */ 2271 CHKERRQ(PetscMalloc1(nrqs,&s_waits1)); 2272 for (i=0; i<nrqs; ++i) { 2273 j = pa[i]; 2274 CHKERRMPI(MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i)); 2275 } 2276 2277 /* Post Receives to capture the buffer size */ 2278 CHKERRQ(PetscMalloc1(nrqs,&r_waits2)); 2279 CHKERRQ(PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3)); 2280 if (nrqs) rbuf2[0] = tmp + msz; 2281 for (i=1; i<nrqs; ++i) { 2282 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2283 } 2284 for (i=0; i<nrqs; ++i) { 2285 j = pa[i]; 2286 CHKERRMPI(MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i)); 2287 } 2288 2289 /* Send to other procs the buf size they should allocate */ 2290 /* Receive messages*/ 2291 CHKERRQ(PetscMalloc1(nrqr,&s_waits2)); 2292 CHKERRQ(PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1)); 2293 { 2294 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2295 PetscInt *sbuf2_i; 2296 2297 CHKERRMPI(MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE)); 2298 for (i=0; i<nrqr; ++i) { 2299 req_size[i] = 0; 2300 rbuf1_i = rbuf1[i]; 2301 start = 2*rbuf1_i[0] + 1; 2302 end = olengths1[i]; 2303 CHKERRQ(PetscMalloc1(end,&sbuf2[i])); 2304 sbuf2_i = sbuf2[i]; 2305 for (j=start; j<end; j++) { 2306 id = rbuf1_i[j] - rstart; 2307 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2308 sbuf2_i[j] = ncols; 2309 req_size[i] += ncols; 2310 } 2311 req_source1[i] = onodes1[i]; 2312 /* form the header */ 2313 sbuf2_i[0] = req_size[i]; 2314 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2315 2316 CHKERRMPI(MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i)); 2317 } 2318 } 2319 2320 CHKERRQ(PetscFree(onodes1)); 2321 CHKERRQ(PetscFree(olengths1)); 2322 CHKERRQ(PetscFree(r_waits1)); 2323 CHKERRQ(PetscFree4(w1,w2,w3,w4)); 2324 2325 /* Receive messages*/ 2326 CHKERRQ(PetscMalloc1(nrqs,&r_waits3)); 2327 CHKERRMPI(MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE)); 2328 for (i=0; i<nrqs; ++i) { 2329 CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf3[i])); 2330 req_source2[i] = pa[i]; 2331 CHKERRMPI(MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i)); 2332 } 2333 CHKERRQ(PetscFree(r_waits2)); 2334 2335 /* Wait on sends1 and sends2 */ 2336 CHKERRMPI(MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE)); 2337 CHKERRMPI(MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE)); 2338 CHKERRQ(PetscFree(s_waits1)); 2339 CHKERRQ(PetscFree(s_waits2)); 2340 2341 /* Now allocate sending buffers for a->j, and send them off */ 2342 CHKERRQ(PetscMalloc1(nrqr,&sbuf_aj)); 2343 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2344 if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aj[0])); 2345 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2346 2347 CHKERRQ(PetscMalloc1(nrqr,&s_waits3)); 2348 { 2349 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2350 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2351 PetscInt cend = C->cmap->rend; 2352 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2353 2354 for (i=0; i<nrqr; i++) { 2355 rbuf1_i = rbuf1[i]; 2356 sbuf_aj_i = sbuf_aj[i]; 2357 ct1 = 2*rbuf1_i[0] + 1; 2358 ct2 = 0; 2359 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2360 kmax = rbuf1[i][2*j]; 2361 for (k=0; k<kmax; k++,ct1++) { 2362 row = rbuf1_i[ct1] - rstart; 2363 nzA = a_i[row+1] - a_i[row]; 2364 nzB = b_i[row+1] - b_i[row]; 2365 ncols = nzA + nzB; 2366 cworkA = a_j + a_i[row]; 2367 cworkB = b_j + b_i[row]; 2368 2369 /* load the column indices for this row into cols */ 2370 cols = sbuf_aj_i + ct2; 2371 2372 lwrite = 0; 2373 for (l=0; l<nzB; l++) { 2374 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2375 } 2376 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2377 for (l=0; l<nzB; l++) { 2378 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2379 } 2380 2381 ct2 += ncols; 2382 } 2383 } 2384 CHKERRMPI(MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i)); 2385 } 2386 } 2387 2388 /* create col map: global col of C -> local col of submatrices */ 2389 { 2390 const PetscInt *icol_i; 2391 #if defined(PETSC_USE_CTABLE) 2392 for (i=0; i<ismax; i++) { 2393 if (!allcolumns[i]) { 2394 CHKERRQ(PetscTableCreate(ncol[i],C->cmap->N,&cmap[i])); 2395 2396 jmax = ncol[i]; 2397 icol_i = icol[i]; 2398 cmap_i = cmap[i]; 2399 for (j=0; j<jmax; j++) { 2400 CHKERRQ(PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES)); 2401 } 2402 } else cmap[i] = NULL; 2403 } 2404 #else 2405 for (i=0; i<ismax; i++) { 2406 if (!allcolumns[i]) { 2407 CHKERRQ(PetscCalloc1(C->cmap->N,&cmap[i])); 2408 jmax = ncol[i]; 2409 icol_i = icol[i]; 2410 cmap_i = cmap[i]; 2411 for (j=0; j<jmax; j++) { 2412 cmap_i[icol_i[j]] = j+1; 2413 } 2414 } else cmap[i] = NULL; 2415 } 2416 #endif 2417 } 2418 2419 /* Create lens which is required for MatCreate... */ 2420 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2421 CHKERRQ(PetscMalloc1(ismax,&lens)); 2422 2423 if (ismax) { 2424 CHKERRQ(PetscCalloc1(j,&lens[0])); 2425 } 2426 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2427 2428 /* Update lens from local data */ 2429 for (i=0; i<ismax; i++) { 2430 row2proc_i = row2proc[i]; 2431 jmax = nrow[i]; 2432 if (!allcolumns[i]) cmap_i = cmap[i]; 2433 irow_i = irow[i]; 2434 lens_i = lens[i]; 2435 for (j=0; j<jmax; j++) { 2436 row = irow_i[j]; 2437 proc = row2proc_i[j]; 2438 if (proc == rank) { 2439 CHKERRQ(MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL)); 2440 if (!allcolumns[i]) { 2441 for (k=0; k<ncols; k++) { 2442 #if defined(PETSC_USE_CTABLE) 2443 CHKERRQ(PetscTableFind(cmap_i,cols[k]+1,&tcol)); 2444 #else 2445 tcol = cmap_i[cols[k]]; 2446 #endif 2447 if (tcol) lens_i[j]++; 2448 } 2449 } else { /* allcolumns */ 2450 lens_i[j] = ncols; 2451 } 2452 CHKERRQ(MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL)); 2453 } 2454 } 2455 } 2456 2457 /* Create row map: global row of C -> local row of submatrices */ 2458 #if defined(PETSC_USE_CTABLE) 2459 for (i=0; i<ismax; i++) { 2460 CHKERRQ(PetscTableCreate(nrow[i],C->rmap->N,&rmap[i])); 2461 irow_i = irow[i]; 2462 jmax = nrow[i]; 2463 for (j=0; j<jmax; j++) { 2464 CHKERRQ(PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES)); 2465 } 2466 } 2467 #else 2468 for (i=0; i<ismax; i++) { 2469 CHKERRQ(PetscCalloc1(C->rmap->N,&rmap[i])); 2470 rmap_i = rmap[i]; 2471 irow_i = irow[i]; 2472 jmax = nrow[i]; 2473 for (j=0; j<jmax; j++) { 2474 rmap_i[irow_i[j]] = j; 2475 } 2476 } 2477 #endif 2478 2479 /* Update lens from offproc data */ 2480 { 2481 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2482 2483 CHKERRMPI(MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE)); 2484 for (tmp2=0; tmp2<nrqs; tmp2++) { 2485 sbuf1_i = sbuf1[pa[tmp2]]; 2486 jmax = sbuf1_i[0]; 2487 ct1 = 2*jmax+1; 2488 ct2 = 0; 2489 rbuf2_i = rbuf2[tmp2]; 2490 rbuf3_i = rbuf3[tmp2]; 2491 for (j=1; j<=jmax; j++) { 2492 is_no = sbuf1_i[2*j-1]; 2493 max1 = sbuf1_i[2*j]; 2494 lens_i = lens[is_no]; 2495 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2496 rmap_i = rmap[is_no]; 2497 for (k=0; k<max1; k++,ct1++) { 2498 #if defined(PETSC_USE_CTABLE) 2499 CHKERRQ(PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row)); 2500 row--; 2501 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2502 #else 2503 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2504 #endif 2505 max2 = rbuf2_i[ct1]; 2506 for (l=0; l<max2; l++,ct2++) { 2507 if (!allcolumns[is_no]) { 2508 #if defined(PETSC_USE_CTABLE) 2509 CHKERRQ(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol)); 2510 #else 2511 tcol = cmap_i[rbuf3_i[ct2]]; 2512 #endif 2513 if (tcol) lens_i[row]++; 2514 } else { /* allcolumns */ 2515 lens_i[row]++; /* lens_i[row] += max2 ? */ 2516 } 2517 } 2518 } 2519 } 2520 } 2521 } 2522 CHKERRQ(PetscFree(r_waits3)); 2523 CHKERRMPI(MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE)); 2524 CHKERRQ(PetscFree(s_waits3)); 2525 2526 /* Create the submatrices */ 2527 for (i=0; i<ismax; i++) { 2528 PetscInt rbs,cbs; 2529 2530 CHKERRQ(ISGetBlockSize(isrow[i],&rbs)); 2531 CHKERRQ(ISGetBlockSize(iscol[i],&cbs)); 2532 2533 CHKERRQ(MatCreate(PETSC_COMM_SELF,submats+i)); 2534 CHKERRQ(MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE)); 2535 2536 CHKERRQ(MatSetBlockSizes(submats[i],rbs,cbs)); 2537 CHKERRQ(MatSetType(submats[i],((PetscObject)A)->type_name)); 2538 CHKERRQ(MatSeqAIJSetPreallocation(submats[i],0,lens[i])); 2539 2540 /* create struct Mat_SubSppt and attached it to submat */ 2541 CHKERRQ(PetscNew(&smat_i)); 2542 subc = (Mat_SeqAIJ*)submats[i]->data; 2543 subc->submatis1 = smat_i; 2544 2545 smat_i->destroy = submats[i]->ops->destroy; 2546 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2547 submats[i]->factortype = C->factortype; 2548 2549 smat_i->id = i; 2550 smat_i->nrqs = nrqs; 2551 smat_i->nrqr = nrqr; 2552 smat_i->rbuf1 = rbuf1; 2553 smat_i->rbuf2 = rbuf2; 2554 smat_i->rbuf3 = rbuf3; 2555 smat_i->sbuf2 = sbuf2; 2556 smat_i->req_source2 = req_source2; 2557 2558 smat_i->sbuf1 = sbuf1; 2559 smat_i->ptr = ptr; 2560 smat_i->tmp = tmp; 2561 smat_i->ctr = ctr; 2562 2563 smat_i->pa = pa; 2564 smat_i->req_size = req_size; 2565 smat_i->req_source1 = req_source1; 2566 2567 smat_i->allcolumns = allcolumns[i]; 2568 smat_i->singleis = PETSC_FALSE; 2569 smat_i->row2proc = row2proc[i]; 2570 smat_i->rmap = rmap[i]; 2571 smat_i->cmap = cmap[i]; 2572 } 2573 2574 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2575 CHKERRQ(MatCreate(PETSC_COMM_SELF,&submats[0])); 2576 CHKERRQ(MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE)); 2577 CHKERRQ(MatSetType(submats[0],MATDUMMY)); 2578 2579 /* create struct Mat_SubSppt and attached it to submat */ 2580 CHKERRQ(PetscNewLog(submats[0],&smat_i)); 2581 submats[0]->data = (void*)smat_i; 2582 2583 smat_i->destroy = submats[0]->ops->destroy; 2584 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2585 submats[0]->factortype = C->factortype; 2586 2587 smat_i->id = 0; 2588 smat_i->nrqs = nrqs; 2589 smat_i->nrqr = nrqr; 2590 smat_i->rbuf1 = rbuf1; 2591 smat_i->rbuf2 = rbuf2; 2592 smat_i->rbuf3 = rbuf3; 2593 smat_i->sbuf2 = sbuf2; 2594 smat_i->req_source2 = req_source2; 2595 2596 smat_i->sbuf1 = sbuf1; 2597 smat_i->ptr = ptr; 2598 smat_i->tmp = tmp; 2599 smat_i->ctr = ctr; 2600 2601 smat_i->pa = pa; 2602 smat_i->req_size = req_size; 2603 smat_i->req_source1 = req_source1; 2604 2605 smat_i->allcolumns = PETSC_FALSE; 2606 smat_i->singleis = PETSC_FALSE; 2607 smat_i->row2proc = NULL; 2608 smat_i->rmap = NULL; 2609 smat_i->cmap = NULL; 2610 } 2611 2612 if (ismax) CHKERRQ(PetscFree(lens[0])); 2613 CHKERRQ(PetscFree(lens)); 2614 if (sbuf_aj) { 2615 CHKERRQ(PetscFree(sbuf_aj[0])); 2616 CHKERRQ(PetscFree(sbuf_aj)); 2617 } 2618 2619 } /* endof scall == MAT_INITIAL_MATRIX */ 2620 2621 /* Post recv matrix values */ 2622 CHKERRQ(PetscObjectGetNewTag((PetscObject)C,&tag4)); 2623 CHKERRQ(PetscMalloc1(nrqs,&rbuf4)); 2624 CHKERRQ(PetscMalloc1(nrqs,&r_waits4)); 2625 for (i=0; i<nrqs; ++i) { 2626 CHKERRQ(PetscMalloc1(rbuf2[i][0],&rbuf4[i])); 2627 CHKERRMPI(MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i)); 2628 } 2629 2630 /* Allocate sending buffers for a->a, and send them off */ 2631 CHKERRQ(PetscMalloc1(nrqr,&sbuf_aa)); 2632 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2633 if (nrqr) CHKERRQ(PetscMalloc1(j,&sbuf_aa[0])); 2634 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2635 2636 CHKERRQ(PetscMalloc1(nrqr,&s_waits4)); 2637 { 2638 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2639 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2640 PetscInt cend = C->cmap->rend; 2641 PetscInt *b_j = b->j; 2642 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2643 2644 CHKERRQ(MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a)); 2645 CHKERRQ(MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a)); 2646 for (i=0; i<nrqr; i++) { 2647 rbuf1_i = rbuf1[i]; 2648 sbuf_aa_i = sbuf_aa[i]; 2649 ct1 = 2*rbuf1_i[0]+1; 2650 ct2 = 0; 2651 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2652 kmax = rbuf1_i[2*j]; 2653 for (k=0; k<kmax; k++,ct1++) { 2654 row = rbuf1_i[ct1] - rstart; 2655 nzA = a_i[row+1] - a_i[row]; 2656 nzB = b_i[row+1] - b_i[row]; 2657 ncols = nzA + nzB; 2658 cworkB = b_j + b_i[row]; 2659 vworkA = a_a + a_i[row]; 2660 vworkB = b_a + b_i[row]; 2661 2662 /* load the column values for this row into vals*/ 2663 vals = sbuf_aa_i+ct2; 2664 2665 lwrite = 0; 2666 for (l=0; l<nzB; l++) { 2667 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2668 } 2669 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2670 for (l=0; l<nzB; l++) { 2671 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2672 } 2673 2674 ct2 += ncols; 2675 } 2676 } 2677 CHKERRMPI(MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i)); 2678 } 2679 CHKERRQ(MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a)); 2680 CHKERRQ(MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a)); 2681 } 2682 2683 /* Assemble the matrices */ 2684 /* First assemble the local rows */ 2685 for (i=0; i<ismax; i++) { 2686 row2proc_i = row2proc[i]; 2687 subc = (Mat_SeqAIJ*)submats[i]->data; 2688 imat_ilen = subc->ilen; 2689 imat_j = subc->j; 2690 imat_i = subc->i; 2691 imat_a = subc->a; 2692 2693 if (!allcolumns[i]) cmap_i = cmap[i]; 2694 rmap_i = rmap[i]; 2695 irow_i = irow[i]; 2696 jmax = nrow[i]; 2697 for (j=0; j<jmax; j++) { 2698 row = irow_i[j]; 2699 proc = row2proc_i[j]; 2700 if (proc == rank) { 2701 old_row = row; 2702 #if defined(PETSC_USE_CTABLE) 2703 CHKERRQ(PetscTableFind(rmap_i,row+1,&row)); 2704 row--; 2705 #else 2706 row = rmap_i[row]; 2707 #endif 2708 ilen_row = imat_ilen[row]; 2709 CHKERRQ(MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals)); 2710 mat_i = imat_i[row]; 2711 mat_a = imat_a + mat_i; 2712 mat_j = imat_j + mat_i; 2713 if (!allcolumns[i]) { 2714 for (k=0; k<ncols; k++) { 2715 #if defined(PETSC_USE_CTABLE) 2716 CHKERRQ(PetscTableFind(cmap_i,cols[k]+1,&tcol)); 2717 #else 2718 tcol = cmap_i[cols[k]]; 2719 #endif 2720 if (tcol) { 2721 *mat_j++ = tcol - 1; 2722 *mat_a++ = vals[k]; 2723 ilen_row++; 2724 } 2725 } 2726 } else { /* allcolumns */ 2727 for (k=0; k<ncols; k++) { 2728 *mat_j++ = cols[k]; /* global col index! */ 2729 *mat_a++ = vals[k]; 2730 ilen_row++; 2731 } 2732 } 2733 CHKERRQ(MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals)); 2734 2735 imat_ilen[row] = ilen_row; 2736 } 2737 } 2738 } 2739 2740 /* Now assemble the off proc rows */ 2741 CHKERRMPI(MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE)); 2742 for (tmp2=0; tmp2<nrqs; tmp2++) { 2743 sbuf1_i = sbuf1[pa[tmp2]]; 2744 jmax = sbuf1_i[0]; 2745 ct1 = 2*jmax + 1; 2746 ct2 = 0; 2747 rbuf2_i = rbuf2[tmp2]; 2748 rbuf3_i = rbuf3[tmp2]; 2749 rbuf4_i = rbuf4[tmp2]; 2750 for (j=1; j<=jmax; j++) { 2751 is_no = sbuf1_i[2*j-1]; 2752 rmap_i = rmap[is_no]; 2753 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2754 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2755 imat_ilen = subc->ilen; 2756 imat_j = subc->j; 2757 imat_i = subc->i; 2758 imat_a = subc->a; 2759 max1 = sbuf1_i[2*j]; 2760 for (k=0; k<max1; k++,ct1++) { 2761 row = sbuf1_i[ct1]; 2762 #if defined(PETSC_USE_CTABLE) 2763 CHKERRQ(PetscTableFind(rmap_i,row+1,&row)); 2764 row--; 2765 #else 2766 row = rmap_i[row]; 2767 #endif 2768 ilen = imat_ilen[row]; 2769 mat_i = imat_i[row]; 2770 mat_a = imat_a + mat_i; 2771 mat_j = imat_j + mat_i; 2772 max2 = rbuf2_i[ct1]; 2773 if (!allcolumns[is_no]) { 2774 for (l=0; l<max2; l++,ct2++) { 2775 #if defined(PETSC_USE_CTABLE) 2776 CHKERRQ(PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol)); 2777 #else 2778 tcol = cmap_i[rbuf3_i[ct2]]; 2779 #endif 2780 if (tcol) { 2781 *mat_j++ = tcol - 1; 2782 *mat_a++ = rbuf4_i[ct2]; 2783 ilen++; 2784 } 2785 } 2786 } else { /* allcolumns */ 2787 for (l=0; l<max2; l++,ct2++) { 2788 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2789 *mat_a++ = rbuf4_i[ct2]; 2790 ilen++; 2791 } 2792 } 2793 imat_ilen[row] = ilen; 2794 } 2795 } 2796 } 2797 2798 if (!iscsorted) { /* sort column indices of the rows */ 2799 for (i=0; i<ismax; i++) { 2800 subc = (Mat_SeqAIJ*)submats[i]->data; 2801 imat_j = subc->j; 2802 imat_i = subc->i; 2803 imat_a = subc->a; 2804 imat_ilen = subc->ilen; 2805 2806 if (allcolumns[i]) continue; 2807 jmax = nrow[i]; 2808 for (j=0; j<jmax; j++) { 2809 mat_i = imat_i[j]; 2810 mat_a = imat_a + mat_i; 2811 mat_j = imat_j + mat_i; 2812 CHKERRQ(PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a)); 2813 } 2814 } 2815 } 2816 2817 CHKERRQ(PetscFree(r_waits4)); 2818 CHKERRMPI(MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE)); 2819 CHKERRQ(PetscFree(s_waits4)); 2820 2821 /* Restore the indices */ 2822 for (i=0; i<ismax; i++) { 2823 CHKERRQ(ISRestoreIndices(isrow[i],irow+i)); 2824 if (!allcolumns[i]) { 2825 CHKERRQ(ISRestoreIndices(iscol[i],icol+i)); 2826 } 2827 } 2828 2829 for (i=0; i<ismax; i++) { 2830 CHKERRQ(MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY)); 2831 CHKERRQ(MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY)); 2832 } 2833 2834 /* Destroy allocated memory */ 2835 if (sbuf_aa) { 2836 CHKERRQ(PetscFree(sbuf_aa[0])); 2837 CHKERRQ(PetscFree(sbuf_aa)); 2838 } 2839 CHKERRQ(PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted)); 2840 2841 for (i=0; i<nrqs; ++i) { 2842 CHKERRQ(PetscFree(rbuf4[i])); 2843 } 2844 CHKERRQ(PetscFree(rbuf4)); 2845 2846 CHKERRQ(PetscFree4(row2proc,cmap,rmap,allcolumns)); 2847 PetscFunctionReturn(0); 2848 } 2849 2850 /* 2851 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2852 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2853 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2854 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2855 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2856 state, and needs to be "assembled" later by compressing B's column space. 2857 2858 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2859 Following this call, C->A & C->B have been created, even if empty. 2860 */ 2861 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2862 { 2863 /* If making this function public, change the error returned in this function away from _PLIB. */ 2864 Mat_MPIAIJ *aij; 2865 Mat_SeqAIJ *Baij; 2866 PetscBool seqaij,Bdisassembled; 2867 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2868 PetscScalar v; 2869 const PetscInt *rowindices,*colindices; 2870 2871 PetscFunctionBegin; 2872 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2873 if (A) { 2874 CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij)); 2875 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2876 if (rowemb) { 2877 CHKERRQ(ISGetLocalSize(rowemb,&m)); 2878 PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2879 } else { 2880 if (C->rmap->n != A->rmap->n) { 2881 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2882 } 2883 } 2884 if (dcolemb) { 2885 CHKERRQ(ISGetLocalSize(dcolemb,&n)); 2886 PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n); 2887 } else { 2888 PetscCheckFalse(C->cmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2889 } 2890 } 2891 if (B) { 2892 CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij)); 2893 PetscCheck(seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2894 if (rowemb) { 2895 CHKERRQ(ISGetLocalSize(rowemb,&m)); 2896 PetscCheckFalse(m != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2897 } else { 2898 if (C->rmap->n != B->rmap->n) { 2899 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2900 } 2901 } 2902 if (ocolemb) { 2903 CHKERRQ(ISGetLocalSize(ocolemb,&n)); 2904 PetscCheckFalse(n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n); 2905 } else { 2906 PetscCheckFalse(C->cmap->N - C->cmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2907 } 2908 } 2909 2910 aij = (Mat_MPIAIJ*)C->data; 2911 if (!aij->A) { 2912 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2913 CHKERRQ(MatCreate(PETSC_COMM_SELF,&aij->A)); 2914 CHKERRQ(MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n)); 2915 CHKERRQ(MatSetBlockSizesFromMats(aij->A,C,C)); 2916 CHKERRQ(MatSetType(aij->A,MATSEQAIJ)); 2917 CHKERRQ(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A)); 2918 } 2919 if (A) { 2920 CHKERRQ(MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A)); 2921 } else { 2922 CHKERRQ(MatSetUp(aij->A)); 2923 } 2924 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2925 /* 2926 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2927 need to "disassemble" B -- convert it to using C's global indices. 2928 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2929 2930 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2931 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2932 2933 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2934 At least avoid calling MatSetValues() and the implied searches? 2935 */ 2936 2937 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2938 #if defined(PETSC_USE_CTABLE) 2939 CHKERRQ(PetscTableDestroy(&aij->colmap)); 2940 #else 2941 CHKERRQ(PetscFree(aij->colmap)); 2942 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2943 if (aij->B) { 2944 CHKERRQ(PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt))); 2945 } 2946 #endif 2947 ngcol = 0; 2948 if (aij->lvec) { 2949 CHKERRQ(VecGetSize(aij->lvec,&ngcol)); 2950 } 2951 if (aij->garray) { 2952 CHKERRQ(PetscFree(aij->garray)); 2953 CHKERRQ(PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt))); 2954 } 2955 CHKERRQ(VecDestroy(&aij->lvec)); 2956 CHKERRQ(VecScatterDestroy(&aij->Mvctx)); 2957 } 2958 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2959 CHKERRQ(MatDestroy(&aij->B)); 2960 } 2961 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2962 CHKERRQ(MatZeroEntries(aij->B)); 2963 } 2964 } 2965 Bdisassembled = PETSC_FALSE; 2966 if (!aij->B) { 2967 CHKERRQ(MatCreate(PETSC_COMM_SELF,&aij->B)); 2968 CHKERRQ(PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B)); 2969 CHKERRQ(MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N)); 2970 CHKERRQ(MatSetBlockSizesFromMats(aij->B,B,B)); 2971 CHKERRQ(MatSetType(aij->B,MATSEQAIJ)); 2972 Bdisassembled = PETSC_TRUE; 2973 } 2974 if (B) { 2975 Baij = (Mat_SeqAIJ*)B->data; 2976 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2977 CHKERRQ(PetscMalloc1(B->rmap->n,&nz)); 2978 for (i=0; i<B->rmap->n; i++) { 2979 nz[i] = Baij->i[i+1] - Baij->i[i]; 2980 } 2981 CHKERRQ(MatSeqAIJSetPreallocation(aij->B,0,nz)); 2982 CHKERRQ(PetscFree(nz)); 2983 } 2984 2985 CHKERRQ(PetscLayoutGetRange(C->rmap,&rstart,&rend)); 2986 shift = rend-rstart; 2987 count = 0; 2988 rowindices = NULL; 2989 colindices = NULL; 2990 if (rowemb) { 2991 CHKERRQ(ISGetIndices(rowemb,&rowindices)); 2992 } 2993 if (ocolemb) { 2994 CHKERRQ(ISGetIndices(ocolemb,&colindices)); 2995 } 2996 for (i=0; i<B->rmap->n; i++) { 2997 PetscInt row; 2998 row = i; 2999 if (rowindices) row = rowindices[i]; 3000 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3001 col = Baij->j[count]; 3002 if (colindices) col = colindices[col]; 3003 if (Bdisassembled && col>=rstart) col += shift; 3004 v = Baij->a[count]; 3005 CHKERRQ(MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES)); 3006 ++count; 3007 } 3008 } 3009 /* No assembly for aij->B is necessary. */ 3010 /* FIXME: set aij->B's nonzerostate correctly. */ 3011 } else { 3012 CHKERRQ(MatSetUp(aij->B)); 3013 } 3014 C->preallocated = PETSC_TRUE; 3015 C->was_assembled = PETSC_FALSE; 3016 C->assembled = PETSC_FALSE; 3017 /* 3018 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3019 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3020 */ 3021 PetscFunctionReturn(0); 3022 } 3023 3024 /* 3025 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3026 */ 3027 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3028 { 3029 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3030 3031 PetscFunctionBegin; 3032 PetscValidPointer(A,2); 3033 PetscValidPointer(B,3); 3034 /* FIXME: make sure C is assembled */ 3035 *A = aij->A; 3036 *B = aij->B; 3037 /* Note that we don't incref *A and *B, so be careful! */ 3038 PetscFunctionReturn(0); 3039 } 3040 3041 /* 3042 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3043 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3044 */ 3045 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3046 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3047 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3048 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3049 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3050 { 3051 PetscMPIInt size,flag; 3052 PetscInt i,ii,cismax,ispar; 3053 Mat *A,*B; 3054 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3055 3056 PetscFunctionBegin; 3057 if (!ismax) PetscFunctionReturn(0); 3058 3059 for (i = 0, cismax = 0; i < ismax; ++i) { 3060 CHKERRMPI(MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag)); 3061 PetscCheckFalse(flag != MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3062 CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm, &size)); 3063 if (size > 1) ++cismax; 3064 } 3065 3066 /* 3067 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3068 ispar counts the number of parallel ISs across C's comm. 3069 */ 3070 CHKERRMPI(MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C))); 3071 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3072 CHKERRQ((*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat)); 3073 PetscFunctionReturn(0); 3074 } 3075 3076 /* if (ispar) */ 3077 /* 3078 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3079 These are used to extract the off-diag portion of the resulting parallel matrix. 3080 The row IS for the off-diag portion is the same as for the diag portion, 3081 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3082 */ 3083 CHKERRQ(PetscMalloc2(cismax,&cisrow,cismax,&ciscol)); 3084 CHKERRQ(PetscMalloc1(cismax,&ciscol_p)); 3085 for (i = 0, ii = 0; i < ismax; ++i) { 3086 CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size)); 3087 if (size > 1) { 3088 /* 3089 TODO: This is the part that's ***NOT SCALABLE***. 3090 To fix this we need to extract just the indices of C's nonzero columns 3091 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3092 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3093 to be done without serializing on the IS list, so, most likely, it is best 3094 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3095 */ 3096 CHKERRQ(ISGetNonlocalIS(iscol[i],&(ciscol[ii]))); 3097 /* Now we have to 3098 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3099 were sorted on each rank, concatenated they might no longer be sorted; 3100 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3101 indices in the nondecreasing order to the original index positions. 3102 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3103 */ 3104 CHKERRQ(ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii)); 3105 CHKERRQ(ISSort(ciscol[ii])); 3106 ++ii; 3107 } 3108 } 3109 CHKERRQ(PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p)); 3110 for (i = 0, ii = 0; i < ismax; ++i) { 3111 PetscInt j,issize; 3112 const PetscInt *indices; 3113 3114 /* 3115 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3116 */ 3117 CHKERRQ(ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i)); 3118 CHKERRQ(ISSort(isrow[i])); 3119 CHKERRQ(ISGetLocalSize(isrow[i],&issize)); 3120 CHKERRQ(ISGetIndices(isrow[i],&indices)); 3121 for (j = 1; j < issize; ++j) { 3122 PetscCheckFalse(indices[j] == indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]); 3123 } 3124 CHKERRQ(ISRestoreIndices(isrow[i],&indices)); 3125 CHKERRQ(ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i)); 3126 CHKERRQ(ISSort(iscol[i])); 3127 CHKERRQ(ISGetLocalSize(iscol[i],&issize)); 3128 CHKERRQ(ISGetIndices(iscol[i],&indices)); 3129 for (j = 1; j < issize; ++j) { 3130 PetscCheckFalse(indices[j-1] == indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]); 3131 } 3132 CHKERRQ(ISRestoreIndices(iscol[i],&indices)); 3133 CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size)); 3134 if (size > 1) { 3135 cisrow[ii] = isrow[i]; 3136 ++ii; 3137 } 3138 } 3139 /* 3140 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3141 array of sequential matrices underlying the resulting parallel matrices. 3142 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3143 contain duplicates. 3144 3145 There are as many diag matrices as there are original index sets. There are only as many parallel 3146 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3147 3148 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3149 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3150 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3151 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3152 with A[i] and B[ii] extracted from the corresponding MPI submat. 3153 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3154 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3155 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3156 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3157 values into A[i] and B[ii] sitting inside the corresponding submat. 3158 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3159 A[i], B[ii], AA[i] or BB[ii] matrices. 3160 */ 3161 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3162 if (scall == MAT_INITIAL_MATRIX) { 3163 CHKERRQ(PetscMalloc1(ismax,submat)); 3164 } 3165 3166 /* Now obtain the sequential A and B submatrices separately. */ 3167 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3168 CHKERRQ((*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A)); 3169 CHKERRQ((*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B)); 3170 3171 /* 3172 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3173 matrices A & B have been extracted directly into the parallel matrices containing them, or 3174 simply into the sequential matrix identical with the corresponding A (if size == 1). 3175 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3176 to have the same sparsity pattern. 3177 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3178 must be constructed for C. This is done by setseqmat(s). 3179 */ 3180 for (i = 0, ii = 0; i < ismax; ++i) { 3181 /* 3182 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3183 That way we can avoid sorting and computing permutations when reusing. 3184 To this end: 3185 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3186 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3187 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3188 */ 3189 MatStructure pattern = DIFFERENT_NONZERO_PATTERN; 3190 3191 CHKERRMPI(MPI_Comm_size(((PetscObject)isrow[i])->comm,&size)); 3192 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3193 if (size > 1) { 3194 if (scall == MAT_INITIAL_MATRIX) { 3195 CHKERRQ(MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i)); 3196 CHKERRQ(MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE)); 3197 CHKERRQ(MatSetType((*submat)[i],MATMPIAIJ)); 3198 CHKERRQ(PetscLayoutSetUp((*submat)[i]->rmap)); 3199 CHKERRQ(PetscLayoutSetUp((*submat)[i]->cmap)); 3200 } 3201 /* 3202 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3203 */ 3204 { 3205 Mat AA = A[i],BB = B[ii]; 3206 3207 if (AA || BB) { 3208 CHKERRQ(setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB)); 3209 CHKERRQ(MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY)); 3210 CHKERRQ(MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY)); 3211 } 3212 CHKERRQ(MatDestroy(&AA)); 3213 } 3214 CHKERRQ(ISDestroy(ciscol+ii)); 3215 CHKERRQ(ISDestroy(ciscol_p+ii)); 3216 ++ii; 3217 } else { /* if (size == 1) */ 3218 if (scall == MAT_REUSE_MATRIX) { 3219 CHKERRQ(MatDestroy(&(*submat)[i])); 3220 } 3221 if (isrow_p[i] || iscol_p[i]) { 3222 CHKERRQ(MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i)); 3223 CHKERRQ(setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i])); 3224 /* Otherwise A is extracted straight into (*submats)[i]. */ 3225 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3226 CHKERRQ(MatDestroy(A+i)); 3227 } else (*submat)[i] = A[i]; 3228 } 3229 CHKERRQ(ISDestroy(&isrow_p[i])); 3230 CHKERRQ(ISDestroy(&iscol_p[i])); 3231 } 3232 CHKERRQ(PetscFree2(cisrow,ciscol)); 3233 CHKERRQ(PetscFree2(isrow_p,iscol_p)); 3234 CHKERRQ(PetscFree(ciscol_p)); 3235 CHKERRQ(PetscFree(A)); 3236 CHKERRQ(MatDestroySubMatrices(cismax,&B)); 3237 PetscFunctionReturn(0); 3238 } 3239 3240 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3241 { 3242 PetscFunctionBegin; 3243 CHKERRQ(MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ)); 3244 PetscFunctionReturn(0); 3245 } 3246