/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Grow each of the imax index sets by ov levels
   of overlap by repeatedly applying the one-level expansion routine.

   The index sets in is[] are replaced in place; each application of
   MatIncreaseOverlap_MPIAIJ_Once adds one layer of neighboring rows.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but each one-level step uses the memory-scalable implementation (PetscSF-based,
   avoids global-size work arrays).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One level of overlap growth,
   memory-scalable version.

   Outline:
     1. Collect all indices from the nidx input index sets and classify each
        entry as local or remote using the matrix row layout.
     2. Build a two-sided communication pattern (PetscCommBuildTwoSided) and
        ship <is id, row> pairs to the owning ranks via a PetscSF broadcast.
     3. Owners expand the received rows (MatIncreaseOverlap_MPIAIJ_Send_Scalable)
        and send the resulting column lists back over a second SF.
     4. While the reply is in flight, the purely local expansion
        (MatIncreaseOverlap_MPIAIJ_Local_Scalable) runs, overlapping
        communication and computation.
     5. Replies are merged into the index sets by
        MatIncreaseOverlap_MPIAIJ_Receive_Scalable.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscInt       **iindices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  iindices = (PetscInt**)indices; /* required because PetscFreeN() cannot work on const */
  ierr = PetscFree2(iindices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge column indices received
   from the owning ranks into the local index sets.

   recvdata holds a stream of triples <is id, data size, rows...>; nrecvs is
   its total length. Each input IS is destroyed and recreated with the merged,
   sorted, duplicate-free index list.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of (max_lsize+nrecvs) entries: enough for its current
     indices plus, in the worst case, every received entry */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - Expand rows requested by other
   ranks and build the reply buffers.

   fromrows holds <is id, row> pairs received from nfrom ranks (sizes/offsets
   in fromsizes). For each requested row owned locally, the nonzero columns of
   both the diagonal (Amat) and off-diagonal (Bmat) parts are gathered, sorted
   and deduplicated per index set. On output *sbrows is the reply data stream
   of triples <is id, count, columns...> and *sbrowsizes the per-rank
   <size, offset> pairs; the caller takes ownership of both arrays.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count how much reply data each rank/IS pair needs */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: use this code again, now filling the reply buffer */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i]  += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows     += indvc_ij;
    }
  }
  /* convert per-rank sizes into <size, offset> pairs expected by the SF setup */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows)     *sbrows     = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each index set by the
   columns of the locally owned rows it references (remote rows are skipped;
   they are handled by the send/receive pair above).

   Each input IS is destroyed and recreated with the sorted, duplicate-free
   union of the gathered columns.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements
  corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       **iidx;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one chunk per destination rank */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  iidx = (PetscInt**)idx; /* required because PetscFreeN() cannot work on const */
  ierr = PetscFree2(iidx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    /* table stores <global row + 1> -> <insertion order + 1>; undo the +1 shifts */
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row; /* entries are stored shifted by +1 in the table */
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for every row already in the set, add the columns of its local
       diagonal (Amat) and off-diagonal (Bmat) nonzeros, using the bit
       table to skip values already present */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
       rbuf - an array of pointers to the received requests

      Output:
       xdata - array of messages to be sent back
       isz1  - size of each message

      For better efficiency perhaps we should malloc separately each xdata[i],
      then if a remalloc is required we need only copy the data for that one row
      rather than all previous rows as it is now where a single large chunk of
      memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count total requested rows (total_sz) and total header slots (ct)
     over all incoming messages */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial reply-buffer estimate (in PetscInt entries, not bytes):
     average nonzeros per local row times number of requested sets */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves, deduplicated via xtable */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* reply buffer full: grow by 1.5x, copy, and rebase the
               per-message pointers xdata[1..i] into the new allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each accepted row by its A (diagonal) and B
         (off-diagonal) column indices */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  /* NOTE(review): mem_estimate and ct3 count PetscInt entries (see
     PetscMalloc1 above), although the message below says "bytes" */
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /*
       ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row lengths = diagonal (ad) plus off-diagonal (bd) nonzeros */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into B's j at my row offset */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge each row's columns in ascending global order:
       B columns below cstart, then A columns, then B columns above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column-index pass above,
       so values line up with the already-gathered b->j */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* MAT_REUSE_MATRIX path did not allocate these above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate known symmetry information to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1257 PetscMPIInt *onodes1,*olengths1,idex,end; 1258 Mat_SubSppt *smatis1; 1259 PetscBool isrowsorted,iscolsorted; 1260 1261 PetscFunctionBegin; 1262 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1263 1264 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1265 size = c->size; 1266 rank = c->rank; 1267 1268 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1269 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1270 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1271 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1272 if (allcolumns) { 1273 icol = NULL; 1274 ncol = C->cmap->N; 1275 } else { 1276 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1277 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1278 } 1279 1280 if (scall == MAT_INITIAL_MATRIX) { 1281 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1282 1283 /* Get some new tags to keep the communication clean */ 1284 tag1 = ((PetscObject)C)->tag; 1285 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1286 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1287 1288 /* evaluate communication - mesg to who, length of mesg, and buffer space 1289 required. 
Based on this, buffers are allocated, and data copied into them */ 1290 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1291 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1292 1293 /* w1[proc] = num of rows owned by proc -- to be requested */ 1294 proc = 0; 1295 nrqs = 0; /* num of outgoing messages */ 1296 for (j=0; j<nrow; j++) { 1297 row = irow[j]; 1298 if (!isrowsorted) proc = 0; 1299 while (row >= C->rmap->range[proc+1]) proc++; 1300 w1[proc]++; 1301 row2proc[j] = proc; /* map row index to proc */ 1302 1303 if (proc != rank && !w2[proc]) { 1304 w2[proc] = 1; nrqs++; 1305 } 1306 } 1307 w1[rank] = 0; /* rows owned by self will not be requested */ 1308 1309 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1310 for (proc=0,j=0; proc<size; proc++) { 1311 if (w1[proc]) { pa[j++] = proc;} 1312 } 1313 1314 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1315 msz = 0; /* total mesg length (for all procs) */ 1316 for (i=0; i<nrqs; i++) { 1317 proc = pa[i]; 1318 w1[proc] += 3; 1319 msz += w1[proc]; 1320 } 1321 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1322 1323 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1324 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1325 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1326 1327 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1328 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1329 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1330 1331 /* Now post the Irecvs corresponding to these messages */ 1332 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1333 1334 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1335 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1336 1337 /* Allocate Memory for outgoing messages 
*/ 1338 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1339 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1340 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1341 1342 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1343 iptr = tmp; 1344 for (i=0; i<nrqs; i++) { 1345 proc = pa[i]; 1346 sbuf1[proc] = iptr; 1347 iptr += w1[proc]; 1348 } 1349 1350 /* Form the outgoing messages */ 1351 /* Initialize the header space */ 1352 for (i=0; i<nrqs; i++) { 1353 proc = pa[i]; 1354 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1355 ptr[proc] = sbuf1[proc] + 3; 1356 } 1357 1358 /* Parse the isrow and copy data into outbuf */ 1359 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1360 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1361 proc = row2proc[j]; 1362 if (proc != rank) { /* copy to the outgoing buf*/ 1363 *ptr[proc] = irow[j]; 1364 ctr[proc]++; ptr[proc]++; 1365 } 1366 } 1367 1368 /* Update the headers for the current IS */ 1369 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1370 if ((ctr_j = ctr[j])) { 1371 sbuf1_j = sbuf1[j]; 1372 k = ++sbuf1_j[0]; 1373 sbuf1_j[2*k] = ctr_j; 1374 sbuf1_j[2*k-1] = 0; 1375 } 1376 } 1377 1378 /* Now post the sends */ 1379 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1380 for (i=0; i<nrqs; ++i) { 1381 proc = pa[i]; 1382 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1383 } 1384 1385 /* Post Receives to capture the buffer size */ 1386 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1387 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1388 1389 rbuf2[0] = tmp + msz; 1390 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1391 1392 for (i=0; i<nrqs; ++i) { 1393 proc = pa[i]; 1394 ierr = 
MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1395 } 1396 1397 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1398 1399 /* Send to other procs the buf size they should allocate */ 1400 /* Receive messages*/ 1401 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1402 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1403 1404 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1405 for (i=0; i<nrqr; ++i) { 1406 req_size[i] = 0; 1407 rbuf1_i = rbuf1[i]; 1408 start = 2*rbuf1_i[0] + 1; 1409 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1410 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1411 sbuf2_i = sbuf2[i]; 1412 for (j=start; j<end; j++) { 1413 k = rbuf1_i[j] - rstart; 1414 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1415 sbuf2_i[j] = ncols; 1416 req_size[i] += ncols; 1417 } 1418 req_source1[i] = r_status1[i].MPI_SOURCE; 1419 1420 /* form the header */ 1421 sbuf2_i[0] = req_size[i]; 1422 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1423 1424 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1425 } 1426 1427 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1428 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1429 1430 /* rbuf2 is received, Post recv column indices a->j */ 1431 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1432 1433 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1434 for (i=0; i<nrqs; ++i) { 1435 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1436 req_source2[i] = r_status2[i].MPI_SOURCE; 1437 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1438 } 1439 1440 /* Wait on sends1 and sends2 */ 1441 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1442 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1443 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1444 ierr = 
PetscFree(s_status1);CHKERRQ(ierr); 1445 1446 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1447 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1448 1449 /* Now allocate sending buffers for a->j, and send them off */ 1450 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1451 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1452 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1453 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1454 1455 for (i=0; i<nrqr; i++) { /* for each requested message */ 1456 rbuf1_i = rbuf1[i]; 1457 sbuf_aj_i = sbuf_aj[i]; 1458 ct1 = 2*rbuf1_i[0] + 1; 1459 ct2 = 0; 1460 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1461 1462 kmax = rbuf1[i][2]; 1463 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1464 row = rbuf1_i[ct1] - rstart; 1465 nzA = ai[row+1] - ai[row]; 1466 nzB = bi[row+1] - bi[row]; 1467 ncols = nzA + nzB; 1468 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1469 1470 /* load the column indices for this row into cols*/ 1471 cols = sbuf_aj_i + ct2; 1472 1473 lwrite = 0; 1474 for (l=0; l<nzB; l++) { 1475 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1476 } 1477 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1478 for (l=0; l<nzB; l++) { 1479 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1480 } 1481 1482 ct2 += ncols; 1483 } 1484 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1485 } 1486 1487 /* create column map (cmap): global col of C -> local col of submat */ 1488 #if defined(PETSC_USE_CTABLE) 1489 if (!allcolumns) { 1490 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1491 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1492 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1493 if (icol[j] >= cstart && icol[j] <cend) { 1494 cmap_loc[icol[j] - cstart] = j+1; 1495 } else { /* use PetscTable 
for non-local col indices */ 1496 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1497 } 1498 } 1499 } else { 1500 cmap = NULL; 1501 cmap_loc = NULL; 1502 } 1503 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1504 #else 1505 if (!allcolumns) { 1506 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1507 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1508 } else { 1509 cmap = NULL; 1510 } 1511 #endif 1512 1513 /* Create lens for MatSeqAIJSetPreallocation() */ 1514 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1515 1516 /* Compute lens from local part of C */ 1517 for (j=0; j<nrow; j++) { 1518 row = irow[j]; 1519 proc = row2proc[j]; 1520 if (proc == rank) { 1521 /* diagonal part A = c->A */ 1522 ncols = ai[row-rstart+1] - ai[row-rstart]; 1523 cols = aj + ai[row-rstart]; 1524 if (!allcolumns) { 1525 for (k=0; k<ncols; k++) { 1526 #if defined(PETSC_USE_CTABLE) 1527 tcol = cmap_loc[cols[k]]; 1528 #else 1529 tcol = cmap[cols[k]+cstart]; 1530 #endif 1531 if (tcol) lens[j]++; 1532 } 1533 } else { /* allcolumns */ 1534 lens[j] = ncols; 1535 } 1536 1537 /* off-diagonal part B = c->B */ 1538 ncols = bi[row-rstart+1] - bi[row-rstart]; 1539 cols = bj + bi[row-rstart]; 1540 if (!allcolumns) { 1541 for (k=0; k<ncols; k++) { 1542 #if defined(PETSC_USE_CTABLE) 1543 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1544 #else 1545 tcol = cmap[bmap[cols[k]]]; 1546 #endif 1547 if (tcol) lens[j]++; 1548 } 1549 } else { /* allcolumns */ 1550 lens[j] += ncols; 1551 } 1552 } 1553 } 1554 1555 /* Create row map (rmap): global row of C -> local row of submat */ 1556 #if defined(PETSC_USE_CTABLE) 1557 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1558 for (j=0; j<nrow; j++) { 1559 row = irow[j]; 1560 proc = row2proc[j]; 1561 if (proc == rank) { /* a local row */ 1562 rmap_loc[row - rstart] = j; 1563 } else { 1564 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1565 } 1566 } 1567 #else 1568 ierr = 
PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1569 for (j=0; j<nrow; j++) { 1570 rmap[irow[j]] = j; 1571 } 1572 #endif 1573 1574 /* Update lens from offproc data */ 1575 /* recv a->j is done */ 1576 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1577 for (i=0; i<nrqs; i++) { 1578 proc = pa[i]; 1579 sbuf1_i = sbuf1[proc]; 1580 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1581 ct1 = 2 + 1; 1582 ct2 = 0; 1583 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1584 rbuf3_i = rbuf3[i]; /* received C->j */ 1585 1586 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1587 max1 = sbuf1_i[2]; 1588 for (k=0; k<max1; k++,ct1++) { 1589 #if defined(PETSC_USE_CTABLE) 1590 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1591 row--; 1592 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1593 #else 1594 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1595 #endif 1596 /* Now, store row index of submat in sbuf1_i[ct1] */ 1597 sbuf1_i[ct1] = row; 1598 1599 nnz = rbuf2_i[ct1]; 1600 if (!allcolumns) { 1601 for (l=0; l<nnz; l++,ct2++) { 1602 #if defined(PETSC_USE_CTABLE) 1603 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1604 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1605 } else { 1606 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1607 } 1608 #else 1609 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1610 #endif 1611 if (tcol) lens[row]++; 1612 } 1613 } else { /* allcolumns */ 1614 lens[row] += nnz; 1615 } 1616 } 1617 } 1618 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1619 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1620 1621 /* Create the submatrices */ 1622 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1623 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1624 1625 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 
1626 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1627 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1628 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1629 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1630 1631 /* create struct Mat_SubSppt and attached it to submat */ 1632 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1633 subc = (Mat_SeqAIJ*)submat->data; 1634 subc->submatis1 = smatis1; 1635 1636 smatis1->id = 0; 1637 smatis1->nrqs = nrqs; 1638 smatis1->nrqr = nrqr; 1639 smatis1->rbuf1 = rbuf1; 1640 smatis1->rbuf2 = rbuf2; 1641 smatis1->rbuf3 = rbuf3; 1642 smatis1->sbuf2 = sbuf2; 1643 smatis1->req_source2 = req_source2; 1644 1645 smatis1->sbuf1 = sbuf1; 1646 smatis1->ptr = ptr; 1647 smatis1->tmp = tmp; 1648 smatis1->ctr = ctr; 1649 1650 smatis1->pa = pa; 1651 smatis1->req_size = req_size; 1652 smatis1->req_source1 = req_source1; 1653 1654 smatis1->allcolumns = allcolumns; 1655 smatis1->singleis = PETSC_TRUE; 1656 smatis1->row2proc = row2proc; 1657 smatis1->rmap = rmap; 1658 smatis1->cmap = cmap; 1659 #if defined(PETSC_USE_CTABLE) 1660 smatis1->rmap_loc = rmap_loc; 1661 smatis1->cmap_loc = cmap_loc; 1662 #endif 1663 1664 smatis1->destroy = submat->ops->destroy; 1665 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1666 submat->factortype = C->factortype; 1667 1668 /* compute rmax */ 1669 rmax = 0; 1670 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1671 1672 } else { /* scall == MAT_REUSE_MATRIX */ 1673 submat = submats[0]; 1674 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1675 1676 subc = (Mat_SeqAIJ*)submat->data; 1677 rmax = subc->rmax; 1678 smatis1 = subc->submatis1; 1679 nrqs = smatis1->nrqs; 1680 nrqr = smatis1->nrqr; 1681 rbuf1 = smatis1->rbuf1; 1682 rbuf2 = smatis1->rbuf2; 1683 rbuf3 = smatis1->rbuf3; 1684 req_source2 = smatis1->req_source2; 1685 1686 sbuf1 = smatis1->sbuf1; 1687 sbuf2 = smatis1->sbuf2; 1688 ptr = smatis1->ptr; 1689 tmp = smatis1->tmp; 1690 ctr = smatis1->ctr; 1691 1692 pa = smatis1->pa; 1693 req_size = smatis1->req_size; 1694 req_source1 = smatis1->req_source1; 1695 1696 allcolumns = smatis1->allcolumns; 1697 row2proc = smatis1->row2proc; 1698 rmap = smatis1->rmap; 1699 cmap = smatis1->cmap; 1700 #if defined(PETSC_USE_CTABLE) 1701 rmap_loc = smatis1->rmap_loc; 1702 cmap_loc = smatis1->cmap_loc; 1703 #endif 1704 } 1705 1706 /* Post recv matrix values */ 1707 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1708 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1709 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1710 for (i=0; i<nrqs; ++i) { 1711 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1712 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1713 } 1714 1715 /* Allocate sending buffers for a->a, and send them off */ 1716 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1717 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1718 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1719 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1720 1721 for (i=0; i<nrqr; i++) { 1722 rbuf1_i = rbuf1[i]; 1723 sbuf_aa_i = sbuf_aa[i]; 1724 ct1 = 2*rbuf1_i[0]+1; 1725 ct2 = 0; 1726 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1727 1728 kmax = rbuf1_i[2]; 1729 for (k=0; k<kmax; k++,ct1++) { 1730 row = rbuf1_i[ct1] - rstart; 1731 nzA = ai[row+1] - ai[row]; 1732 nzB = bi[row+1] - 
bi[row]; 1733 ncols = nzA + nzB; 1734 cworkB = bj + bi[row]; 1735 vworkA = a_a + ai[row]; 1736 vworkB = b_a + bi[row]; 1737 1738 /* load the column values for this row into vals*/ 1739 vals = sbuf_aa_i + ct2; 1740 1741 lwrite = 0; 1742 for (l=0; l<nzB; l++) { 1743 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1744 } 1745 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1746 for (l=0; l<nzB; l++) { 1747 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1748 } 1749 1750 ct2 += ncols; 1751 } 1752 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1753 } 1754 1755 /* Assemble submat */ 1756 /* First assemble the local rows */ 1757 for (j=0; j<nrow; j++) { 1758 row = irow[j]; 1759 proc = row2proc[j]; 1760 if (proc == rank) { 1761 Crow = row - rstart; /* local row index of C */ 1762 #if defined(PETSC_USE_CTABLE) 1763 row = rmap_loc[Crow]; /* row index of submat */ 1764 #else 1765 row = rmap[row]; 1766 #endif 1767 1768 if (allcolumns) { 1769 /* diagonal part A = c->A */ 1770 ncols = ai[Crow+1] - ai[Crow]; 1771 cols = aj + ai[Crow]; 1772 vals = a->a + ai[Crow]; 1773 i = 0; 1774 for (k=0; k<ncols; k++) { 1775 subcols[i] = cols[k] + cstart; 1776 subvals[i++] = vals[k]; 1777 } 1778 1779 /* off-diagonal part B = c->B */ 1780 ncols = bi[Crow+1] - bi[Crow]; 1781 cols = bj + bi[Crow]; 1782 vals = b->a + bi[Crow]; 1783 for (k=0; k<ncols; k++) { 1784 subcols[i] = bmap[cols[k]]; 1785 subvals[i++] = vals[k]; 1786 } 1787 1788 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1789 1790 } else { /* !allcolumns */ 1791 #if defined(PETSC_USE_CTABLE) 1792 /* diagonal part A = c->A */ 1793 ncols = ai[Crow+1] - ai[Crow]; 1794 cols = aj + ai[Crow]; 1795 vals = a->a + ai[Crow]; 1796 i = 0; 1797 for (k=0; k<ncols; k++) { 1798 tcol = cmap_loc[cols[k]]; 1799 if (tcol) { 1800 subcols[i] = --tcol; 1801 subvals[i++] = vals[k]; 1802 } 1803 } 1804 1805 /* off-diagonal part B = c->B */ 
1806 ncols = bi[Crow+1] - bi[Crow]; 1807 cols = bj + bi[Crow]; 1808 vals = b->a + bi[Crow]; 1809 for (k=0; k<ncols; k++) { 1810 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1811 if (tcol) { 1812 subcols[i] = --tcol; 1813 subvals[i++] = vals[k]; 1814 } 1815 } 1816 #else 1817 /* diagonal part A = c->A */ 1818 ncols = ai[Crow+1] - ai[Crow]; 1819 cols = aj + ai[Crow]; 1820 vals = a->a + ai[Crow]; 1821 i = 0; 1822 for (k=0; k<ncols; k++) { 1823 tcol = cmap[cols[k]+cstart]; 1824 if (tcol) { 1825 subcols[i] = --tcol; 1826 subvals[i++] = vals[k]; 1827 } 1828 } 1829 1830 /* off-diagonal part B = c->B */ 1831 ncols = bi[Crow+1] - bi[Crow]; 1832 cols = bj + bi[Crow]; 1833 vals = b->a + bi[Crow]; 1834 for (k=0; k<ncols; k++) { 1835 tcol = cmap[bmap[cols[k]]]; 1836 if (tcol) { 1837 subcols[i] = --tcol; 1838 subvals[i++] = vals[k]; 1839 } 1840 } 1841 #endif 1842 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1843 } 1844 } 1845 } 1846 1847 /* Now assemble the off-proc rows */ 1848 for (i=0; i<nrqs; i++) { /* for each requested message */ 1849 /* recv values from other processes */ 1850 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1851 proc = pa[idex]; 1852 sbuf1_i = sbuf1[proc]; 1853 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1854 ct1 = 2 + 1; 1855 ct2 = 0; /* count of received C->j */ 1856 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1857 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1858 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1859 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1860 1861 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1862 max1 = sbuf1_i[2]; /* num of rows */ 1863 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1864 row = sbuf1_i[ct1]; /* row index of submat */ 
1865 if (!allcolumns) { 1866 idex = 0; 1867 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1868 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1869 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1870 #if defined(PETSC_USE_CTABLE) 1871 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1872 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1873 } else { 1874 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1875 } 1876 #else 1877 tcol = cmap[rbuf3_i[ct2]]; 1878 #endif 1879 if (tcol) { 1880 subcols[idex] = --tcol; /* may not be sorted */ 1881 subvals[idex++] = rbuf4_i[ct2]; 1882 1883 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1884 For reuse, we replace received C->j with index that should be inserted to submat */ 1885 rbuf3_i[ct3++] = ct2; 1886 } 1887 } 1888 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1889 } else { /* scall == MAT_REUSE_MATRIX */ 1890 submat = submats[0]; 1891 subc = (Mat_SeqAIJ*)submat->data; 1892 1893 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1894 for (l=0; l<nnz; l++) { 1895 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1896 subvals[idex++] = rbuf4_i[ct2]; 1897 } 1898 1899 bj = subc->j + subc->i[row]; /* sorted column indices */ 1900 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1901 } 1902 } else { /* allcolumns */ 1903 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1904 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1905 ct2 += nnz; 1906 } 1907 } 1908 } 1909 1910 /* sending a->a are done */ 1911 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1912 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1913 1914 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1915 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1916 submats[0] = submat; 1917 1918 /* Restore the indices */ 1919 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1920 if (!allcolumns) { 1921 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1922 } 1923 1924 /* Destroy allocated memory */ 1925 for (i=0; i<nrqs; ++i) { 1926 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1927 } 1928 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1929 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1930 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1931 1932 if (scall == MAT_INITIAL_MATRIX) { 1933 ierr = PetscFree(lens);CHKERRQ(ierr); 1934 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1935 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1936 } 1937 PetscFunctionReturn(0); 1938 } 1939 1940 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1941 { 1942 PetscErrorCode ierr; 1943 PetscInt ncol; 1944 PetscBool colflag,allcolumns=PETSC_FALSE; 1945 1946 PetscFunctionBegin; 1947 /* Allocate memory to hold all the submatrices */ 1948 if (scall == MAT_INITIAL_MATRIX) { 1949 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1950 } 1951 1952 /* Check for special case: each processor gets entire matrix columns */ 1953 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1954 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1955 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1956 1957 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1958 PetscFunctionReturn(0); 1959 } 1960 1961 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1962 { 1963 PetscErrorCode ierr; 1964 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2]; 1965 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1966 Mat_SeqAIJ *subc; 1967 Mat_SubSppt *smat; 1968 1969 
PetscFunctionBegin; 1970 /* Check for special case: each processor has a single IS */ 1971 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1972 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1973 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */ 1974 PetscFunctionReturn(0); 1975 } 1976 1977 /* Collect global wantallmatrix and nstages */ 1978 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1979 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1980 if (!nmax) nmax = 1; 1981 1982 if (scall == MAT_INITIAL_MATRIX) { 1983 /* Collect global wantallmatrix and nstages */ 1984 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1985 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1986 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1987 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1988 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1989 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1990 wantallmatrix = PETSC_TRUE; 1991 1992 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1993 } 1994 } 1995 1996 /* Determine the number of stages through which submatrices are done 1997 Each stage will extract nmax submatrices. 1998 nmax is determined by the matrix column dimension. 1999 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2000 */ 2001 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 2002 2003 in[0] = -1*(PetscInt)wantallmatrix; 2004 in[1] = nstages; 2005 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2006 wantallmatrix = (PetscBool)(-out[0]); 2007 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2008 2009 } else { /* MAT_REUSE_MATRIX */ 2010 if (ismax) { 2011 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2012 smat = subc->submatis1; 2013 } else { /* (*submat)[0] is a dummy matrix */ 2014 smat = (Mat_SubSppt*)(*submat)[0]->data; 2015 } 2016 if (!smat) { 2017 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2018 wantallmatrix = PETSC_TRUE; 2019 } else if (smat->singleis) { 2020 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2021 PetscFunctionReturn(0); 2022 } else { 2023 nstages = smat->nstages; 2024 } 2025 } 2026 2027 if (wantallmatrix) { 2028 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2029 PetscFunctionReturn(0); 2030 } 2031 2032 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2033 if (scall == MAT_INITIAL_MATRIX) { 2034 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2035 } 2036 2037 for (i=0,pos=0; i<nstages; i++) { 2038 if (pos+nmax <= ismax) max_no = nmax; 2039 else if (pos == ismax) max_no = 0; 2040 else max_no = ismax-pos; 2041 2042 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2043 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2044 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2045 smat->nstages = nstages; 2046 } 2047 pos += max_no; 2048 } 2049 2050 if (ismax && scall == MAT_INITIAL_MATRIX) { 2051 /* save nstages for reuse */ 2052 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2053 smat = subc->submatis1; 2054 smat->nstages = nstages; 2055 } 2056 PetscFunctionReturn(0); 2057 } 2058 2059 /* 
-------------------------------------------------------------------------*/ 2060 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2061 { 2062 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2063 Mat A = c->A; 2064 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2065 const PetscInt **icol,**irow; 2066 PetscInt **iicol,**iirow; 2067 PetscInt *nrow,*ncol,start; 2068 PetscErrorCode ierr; 2069 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2070 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2071 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2072 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2073 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2074 #if defined(PETSC_USE_CTABLE) 2075 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2076 #else 2077 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2078 #endif 2079 const PetscInt *irow_i; 2080 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2081 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2082 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2083 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2084 MPI_Status *r_status3,*r_status4,*s_status4; 2085 MPI_Comm comm; 2086 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2087 PetscMPIInt *onodes1,*olengths1,end; 2088 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2089 Mat_SubSppt *smat_i; 2090 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2091 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2092 2093 PetscFunctionBegin; 2094 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2095 size = c->size; 2096 rank = c->rank; 2097 2098 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2099 ierr = 
PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2100 2101 for (i=0; i<ismax; i++) { 2102 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2103 if (!issorted[i]) iscsorted = issorted[i]; 2104 2105 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2106 2107 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2108 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2109 2110 /* Check for special case: allcolumn */ 2111 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2112 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2113 if (colflag && ncol[i] == C->cmap->N) { 2114 allcolumns[i] = PETSC_TRUE; 2115 icol[i] = NULL; 2116 } else { 2117 allcolumns[i] = PETSC_FALSE; 2118 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2119 } 2120 } 2121 2122 if (scall == MAT_REUSE_MATRIX) { 2123 /* Assumes new rows are same length as the old rows */ 2124 for (i=0; i<ismax; i++) { 2125 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2126 subc = (Mat_SeqAIJ*)submats[i]->data; 2127 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 2128 2129 /* Initial matrix as if empty */ 2130 ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2131 2132 smat_i = subc->submatis1; 2133 2134 nrqs = smat_i->nrqs; 2135 nrqr = smat_i->nrqr; 2136 rbuf1 = smat_i->rbuf1; 2137 rbuf2 = smat_i->rbuf2; 2138 rbuf3 = smat_i->rbuf3; 2139 req_source2 = smat_i->req_source2; 2140 2141 sbuf1 = smat_i->sbuf1; 2142 sbuf2 = smat_i->sbuf2; 2143 ptr = smat_i->ptr; 2144 tmp = smat_i->tmp; 2145 ctr = smat_i->ctr; 2146 2147 pa = smat_i->pa; 2148 req_size = smat_i->req_size; 2149 req_source1 = smat_i->req_source1; 2150 2151 allcolumns[i] = smat_i->allcolumns; 2152 row2proc[i] = smat_i->row2proc; 2153 rmap[i] = smat_i->rmap; 2154 cmap[i] = smat_i->cmap; 2155 } 2156 2157 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2158 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2159 smat_i = (Mat_SubSppt*)submats[0]->data; 2160 2161 nrqs = smat_i->nrqs; 2162 nrqr = smat_i->nrqr; 2163 rbuf1 = smat_i->rbuf1; 2164 rbuf2 = smat_i->rbuf2; 2165 rbuf3 = smat_i->rbuf3; 2166 req_source2 = smat_i->req_source2; 2167 2168 sbuf1 = smat_i->sbuf1; 2169 sbuf2 = smat_i->sbuf2; 2170 ptr = smat_i->ptr; 2171 tmp = smat_i->tmp; 2172 ctr = smat_i->ctr; 2173 2174 pa = smat_i->pa; 2175 req_size = smat_i->req_size; 2176 req_source1 = smat_i->req_source1; 2177 2178 allcolumns[0] = PETSC_FALSE; 2179 } 2180 } else { /* scall == MAT_INITIAL_MATRIX */ 2181 /* Get some new tags to keep the communication clean */ 2182 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2183 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2184 2185 /* evaluate communication - mesg to who, length of mesg, and buffer space 2186 required. 
Based on this, buffers are allocated, and data copied into them*/ 2187 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2188 2189 for (i=0; i<ismax; i++) { 2190 jmax = nrow[i]; 2191 irow_i = irow[i]; 2192 2193 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2194 row2proc[i] = row2proc_i; 2195 2196 if (issorted[i]) proc = 0; 2197 for (j=0; j<jmax; j++) { 2198 if (!issorted[i]) proc = 0; 2199 row = irow_i[j]; 2200 while (row >= C->rmap->range[proc+1]) proc++; 2201 w4[proc]++; 2202 row2proc_i[j] = proc; /* map row index to proc */ 2203 } 2204 for (j=0; j<size; j++) { 2205 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2206 } 2207 } 2208 2209 nrqs = 0; /* no of outgoing messages */ 2210 msz = 0; /* total mesg length (for all procs) */ 2211 w1[rank] = 0; /* no mesg sent to self */ 2212 w3[rank] = 0; 2213 for (i=0; i<size; i++) { 2214 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2215 } 2216 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2217 for (i=0,j=0; i<size; i++) { 2218 if (w1[i]) { pa[j] = i; j++; } 2219 } 2220 2221 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2222 for (i=0; i<nrqs; i++) { 2223 j = pa[i]; 2224 w1[j] += w2[j] + 2* w3[j]; 2225 msz += w1[j]; 2226 } 2227 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2228 2229 /* Determine the number of messages to expect, their lengths, from from-ids */ 2230 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2231 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2232 2233 /* Now post the Irecvs corresponding to these messages */ 2234 tag0 = ((PetscObject)C)->tag; 2235 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2236 2237 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2238 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2239 2240 /* 
Allocate Memory for outgoing messages */ 2241 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2242 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2243 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2244 2245 { 2246 PetscInt *iptr = tmp; 2247 k = 0; 2248 for (i=0; i<nrqs; i++) { 2249 j = pa[i]; 2250 iptr += k; 2251 sbuf1[j] = iptr; 2252 k = w1[j]; 2253 } 2254 } 2255 2256 /* Form the outgoing messages. Initialize the header space */ 2257 for (i=0; i<nrqs; i++) { 2258 j = pa[i]; 2259 sbuf1[j][0] = 0; 2260 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2261 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2262 } 2263 2264 /* Parse the isrow and copy data into outbuf */ 2265 for (i=0; i<ismax; i++) { 2266 row2proc_i = row2proc[i]; 2267 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2268 irow_i = irow[i]; 2269 jmax = nrow[i]; 2270 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2271 proc = row2proc_i[j]; 2272 if (proc != rank) { /* copy to the outgoing buf*/ 2273 ctr[proc]++; 2274 *ptr[proc] = irow_i[j]; 2275 ptr[proc]++; 2276 } 2277 } 2278 /* Update the headers for the current IS */ 2279 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2280 if ((ctr_j = ctr[j])) { 2281 sbuf1_j = sbuf1[j]; 2282 k = ++sbuf1_j[0]; 2283 sbuf1_j[2*k] = ctr_j; 2284 sbuf1_j[2*k-1] = i; 2285 } 2286 } 2287 } 2288 2289 /* Now post the sends */ 2290 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2291 for (i=0; i<nrqs; ++i) { 2292 j = pa[i]; 2293 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2294 } 2295 2296 /* Post Receives to capture the buffer size */ 2297 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2298 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2299 rbuf2[0] = tmp + msz; 2300 for (i=1; i<nrqs; ++i) { 2301 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2302 } 2303 for (i=0; i<nrqs; ++i) { 2304 j = pa[i]; 2305 
ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 2306 } 2307 2308 /* Send to other procs the buf size they should allocate */ 2309 /* Receive messages*/ 2310 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2311 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2312 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2313 { 2314 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2315 PetscInt *sbuf2_i; 2316 2317 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2318 for (i=0; i<nrqr; ++i) { 2319 req_size[i] = 0; 2320 rbuf1_i = rbuf1[i]; 2321 start = 2*rbuf1_i[0] + 1; 2322 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2323 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2324 sbuf2_i = sbuf2[i]; 2325 for (j=start; j<end; j++) { 2326 id = rbuf1_i[j] - rstart; 2327 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2328 sbuf2_i[j] = ncols; 2329 req_size[i] += ncols; 2330 } 2331 req_source1[i] = r_status1[i].MPI_SOURCE; 2332 /* form the header */ 2333 sbuf2_i[0] = req_size[i]; 2334 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2335 2336 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 2337 } 2338 } 2339 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2340 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2341 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2342 2343 /* Receive messages*/ 2344 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2345 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2346 2347 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2348 for (i=0; i<nrqs; ++i) { 2349 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2350 req_source2[i] = r_status2[i].MPI_SOURCE; 2351 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 2352 } 2353 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2354 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2355 2356 /* Wait on 
sends1 and sends2 */ 2357 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2358 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2359 2360 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2361 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2362 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2363 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2364 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2365 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2366 2367 /* Now allocate sending buffers for a->j, and send them off */ 2368 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2369 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2370 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2371 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2372 2373 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2374 { 2375 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2376 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2377 PetscInt cend = C->cmap->rend; 2378 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2379 2380 for (i=0; i<nrqr; i++) { 2381 rbuf1_i = rbuf1[i]; 2382 sbuf_aj_i = sbuf_aj[i]; 2383 ct1 = 2*rbuf1_i[0] + 1; 2384 ct2 = 0; 2385 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2386 kmax = rbuf1[i][2*j]; 2387 for (k=0; k<kmax; k++,ct1++) { 2388 row = rbuf1_i[ct1] - rstart; 2389 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2390 ncols = nzA + nzB; 2391 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2392 2393 /* load the column indices for this row into cols */ 2394 cols = sbuf_aj_i + ct2; 2395 2396 lwrite = 0; 2397 for (l=0; l<nzB; l++) { 2398 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2399 } 2400 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2401 for (l=0; l<nzB; l++) { 2402 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2403 } 2404 2405 ct2 += ncols; 2406 } 2407 } 2408 ierr = 
MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 2409 } 2410 } 2411 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2412 2413 /* create col map: global col of C -> local col of submatrices */ 2414 { 2415 const PetscInt *icol_i; 2416 #if defined(PETSC_USE_CTABLE) 2417 for (i=0; i<ismax; i++) { 2418 if (!allcolumns[i]) { 2419 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2420 2421 jmax = ncol[i]; 2422 icol_i = icol[i]; 2423 cmap_i = cmap[i]; 2424 for (j=0; j<jmax; j++) { 2425 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2426 } 2427 } else cmap[i] = NULL; 2428 } 2429 #else 2430 for (i=0; i<ismax; i++) { 2431 if (!allcolumns[i]) { 2432 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2433 jmax = ncol[i]; 2434 icol_i = icol[i]; 2435 cmap_i = cmap[i]; 2436 for (j=0; j<jmax; j++) { 2437 cmap_i[icol_i[j]] = j+1; 2438 } 2439 } else cmap[i] = NULL; 2440 } 2441 #endif 2442 } 2443 2444 /* Create lens which is required for MatCreate... 
*/ 2445 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2446 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2447 2448 if (ismax) { 2449 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2450 } 2451 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2452 2453 /* Update lens from local data */ 2454 for (i=0; i<ismax; i++) { 2455 row2proc_i = row2proc[i]; 2456 jmax = nrow[i]; 2457 if (!allcolumns[i]) cmap_i = cmap[i]; 2458 irow_i = irow[i]; 2459 lens_i = lens[i]; 2460 for (j=0; j<jmax; j++) { 2461 row = irow_i[j]; 2462 proc = row2proc_i[j]; 2463 if (proc == rank) { 2464 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2465 if (!allcolumns[i]) { 2466 for (k=0; k<ncols; k++) { 2467 #if defined(PETSC_USE_CTABLE) 2468 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2469 #else 2470 tcol = cmap_i[cols[k]]; 2471 #endif 2472 if (tcol) lens_i[j]++; 2473 } 2474 } else { /* allcolumns */ 2475 lens_i[j] = ncols; 2476 } 2477 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2478 } 2479 } 2480 } 2481 2482 /* Create row map: global row of C -> local row of submatrices */ 2483 #if defined(PETSC_USE_CTABLE) 2484 for (i=0; i<ismax; i++) { 2485 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2486 irow_i = irow[i]; 2487 jmax = nrow[i]; 2488 for (j=0; j<jmax; j++) { 2489 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2490 } 2491 } 2492 #else 2493 for (i=0; i<ismax; i++) { 2494 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2495 rmap_i = rmap[i]; 2496 irow_i = irow[i]; 2497 jmax = nrow[i]; 2498 for (j=0; j<jmax; j++) { 2499 rmap_i[irow_i[j]] = j; 2500 } 2501 } 2502 #endif 2503 2504 /* Update lens from offproc data */ 2505 { 2506 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2507 2508 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2509 for (tmp2=0; tmp2<nrqs; tmp2++) { 2510 sbuf1_i = sbuf1[pa[tmp2]]; 2511 jmax = sbuf1_i[0]; 2512 ct1 = 2*jmax+1; 2513 ct2 = 0; 2514 rbuf2_i = rbuf2[tmp2]; 2515 
rbuf3_i = rbuf3[tmp2]; 2516 for (j=1; j<=jmax; j++) { 2517 is_no = sbuf1_i[2*j-1]; 2518 max1 = sbuf1_i[2*j]; 2519 lens_i = lens[is_no]; 2520 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2521 rmap_i = rmap[is_no]; 2522 for (k=0; k<max1; k++,ct1++) { 2523 #if defined(PETSC_USE_CTABLE) 2524 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2525 row--; 2526 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2527 #else 2528 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2529 #endif 2530 max2 = rbuf2_i[ct1]; 2531 for (l=0; l<max2; l++,ct2++) { 2532 if (!allcolumns[is_no]) { 2533 #if defined(PETSC_USE_CTABLE) 2534 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2535 #else 2536 tcol = cmap_i[rbuf3_i[ct2]]; 2537 #endif 2538 if (tcol) lens_i[row]++; 2539 } else { /* allcolumns */ 2540 lens_i[row]++; /* lens_i[row] += max2 ? */ 2541 } 2542 } 2543 } 2544 } 2545 } 2546 } 2547 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2548 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2549 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2550 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2551 2552 /* Create the submatrices */ 2553 for (i=0; i<ismax; i++) { 2554 PetscInt rbs,cbs; 2555 2556 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2557 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2558 2559 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2560 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2561 2562 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2563 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2564 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2565 2566 /* create struct Mat_SubSppt and attached it to submat */ 2567 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2568 subc = (Mat_SeqAIJ*)submats[i]->data; 2569 subc->submatis1 = smat_i; 2570 2571 smat_i->destroy = 
submats[i]->ops->destroy; 2572 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2573 submats[i]->factortype = C->factortype; 2574 2575 smat_i->id = i; 2576 smat_i->nrqs = nrqs; 2577 smat_i->nrqr = nrqr; 2578 smat_i->rbuf1 = rbuf1; 2579 smat_i->rbuf2 = rbuf2; 2580 smat_i->rbuf3 = rbuf3; 2581 smat_i->sbuf2 = sbuf2; 2582 smat_i->req_source2 = req_source2; 2583 2584 smat_i->sbuf1 = sbuf1; 2585 smat_i->ptr = ptr; 2586 smat_i->tmp = tmp; 2587 smat_i->ctr = ctr; 2588 2589 smat_i->pa = pa; 2590 smat_i->req_size = req_size; 2591 smat_i->req_source1 = req_source1; 2592 2593 smat_i->allcolumns = allcolumns[i]; 2594 smat_i->singleis = PETSC_FALSE; 2595 smat_i->row2proc = row2proc[i]; 2596 smat_i->rmap = rmap[i]; 2597 smat_i->cmap = cmap[i]; 2598 } 2599 2600 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2601 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2602 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2603 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2604 2605 /* create struct Mat_SubSppt and attached it to submat */ 2606 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2607 submats[0]->data = (void*)smat_i; 2608 2609 smat_i->destroy = submats[0]->ops->destroy; 2610 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2611 submats[0]->factortype = C->factortype; 2612 2613 smat_i->id = 0; 2614 smat_i->nrqs = nrqs; 2615 smat_i->nrqr = nrqr; 2616 smat_i->rbuf1 = rbuf1; 2617 smat_i->rbuf2 = rbuf2; 2618 smat_i->rbuf3 = rbuf3; 2619 smat_i->sbuf2 = sbuf2; 2620 smat_i->req_source2 = req_source2; 2621 2622 smat_i->sbuf1 = sbuf1; 2623 smat_i->ptr = ptr; 2624 smat_i->tmp = tmp; 2625 smat_i->ctr = ctr; 2626 2627 smat_i->pa = pa; 2628 smat_i->req_size = req_size; 2629 smat_i->req_source1 = req_source1; 2630 2631 smat_i->allcolumns = PETSC_FALSE; 2632 smat_i->singleis = PETSC_FALSE; 2633 smat_i->row2proc = NULL; 2634 smat_i->rmap = NULL; 2635 smat_i->cmap = NULL; 2636 } 2637 2638 if (ismax) {ierr = 
PetscFree(lens[0]);CHKERRQ(ierr);} 2639 ierr = PetscFree(lens);CHKERRQ(ierr); 2640 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2641 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2642 2643 } /* endof scall == MAT_INITIAL_MATRIX */ 2644 2645 /* Post recv matrix values */ 2646 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2647 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2648 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2649 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2650 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2651 for (i=0; i<nrqs; ++i) { 2652 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2653 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 2654 } 2655 2656 /* Allocate sending buffers for a->a, and send them off */ 2657 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2658 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2659 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2660 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2661 2662 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2663 { 2664 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2665 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2666 PetscInt cend = C->cmap->rend; 2667 PetscInt *b_j = b->j; 2668 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2669 2670 for (i=0; i<nrqr; i++) { 2671 rbuf1_i = rbuf1[i]; 2672 sbuf_aa_i = sbuf_aa[i]; 2673 ct1 = 2*rbuf1_i[0]+1; 2674 ct2 = 0; 2675 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2676 kmax = rbuf1_i[2*j]; 2677 for (k=0; k<kmax; k++,ct1++) { 2678 row = rbuf1_i[ct1] - rstart; 2679 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2680 ncols = nzA + nzB; 2681 cworkB = b_j + b_i[row]; 2682 vworkA = a_a + a_i[row]; 2683 vworkB = b_a + b_i[row]; 2684 2685 /* load the column values for this row into vals*/ 2686 vals = sbuf_aa_i+ct2; 2687 2688 lwrite = 0; 2689 for (l=0; l<nzB; 
l++) { 2690 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2691 } 2692 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2693 for (l=0; l<nzB; l++) { 2694 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2695 } 2696 2697 ct2 += ncols; 2698 } 2699 } 2700 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 2701 } 2702 } 2703 2704 /* Assemble the matrices */ 2705 /* First assemble the local rows */ 2706 for (i=0; i<ismax; i++) { 2707 row2proc_i = row2proc[i]; 2708 subc = (Mat_SeqAIJ*)submats[i]->data; 2709 imat_ilen = subc->ilen; 2710 imat_j = subc->j; 2711 imat_i = subc->i; 2712 imat_a = subc->a; 2713 2714 if (!allcolumns[i]) cmap_i = cmap[i]; 2715 rmap_i = rmap[i]; 2716 irow_i = irow[i]; 2717 jmax = nrow[i]; 2718 for (j=0; j<jmax; j++) { 2719 row = irow_i[j]; 2720 proc = row2proc_i[j]; 2721 if (proc == rank) { 2722 old_row = row; 2723 #if defined(PETSC_USE_CTABLE) 2724 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2725 row--; 2726 #else 2727 row = rmap_i[row]; 2728 #endif 2729 ilen_row = imat_ilen[row]; 2730 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2731 mat_i = imat_i[row]; 2732 mat_a = imat_a + mat_i; 2733 mat_j = imat_j + mat_i; 2734 if (!allcolumns[i]) { 2735 for (k=0; k<ncols; k++) { 2736 #if defined(PETSC_USE_CTABLE) 2737 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2738 #else 2739 tcol = cmap_i[cols[k]]; 2740 #endif 2741 if (tcol) { 2742 *mat_j++ = tcol - 1; 2743 *mat_a++ = vals[k]; 2744 ilen_row++; 2745 } 2746 } 2747 } else { /* allcolumns */ 2748 for (k=0; k<ncols; k++) { 2749 *mat_j++ = cols[k]; /* global col index! 
*/ 2750 *mat_a++ = vals[k]; 2751 ilen_row++; 2752 } 2753 } 2754 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2755 2756 imat_ilen[row] = ilen_row; 2757 } 2758 } 2759 } 2760 2761 /* Now assemble the off proc rows */ 2762 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2763 for (tmp2=0; tmp2<nrqs; tmp2++) { 2764 sbuf1_i = sbuf1[pa[tmp2]]; 2765 jmax = sbuf1_i[0]; 2766 ct1 = 2*jmax + 1; 2767 ct2 = 0; 2768 rbuf2_i = rbuf2[tmp2]; 2769 rbuf3_i = rbuf3[tmp2]; 2770 rbuf4_i = rbuf4[tmp2]; 2771 for (j=1; j<=jmax; j++) { 2772 is_no = sbuf1_i[2*j-1]; 2773 rmap_i = rmap[is_no]; 2774 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2775 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2776 imat_ilen = subc->ilen; 2777 imat_j = subc->j; 2778 imat_i = subc->i; 2779 imat_a = subc->a; 2780 max1 = sbuf1_i[2*j]; 2781 for (k=0; k<max1; k++,ct1++) { 2782 row = sbuf1_i[ct1]; 2783 #if defined(PETSC_USE_CTABLE) 2784 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2785 row--; 2786 #else 2787 row = rmap_i[row]; 2788 #endif 2789 ilen = imat_ilen[row]; 2790 mat_i = imat_i[row]; 2791 mat_a = imat_a + mat_i; 2792 mat_j = imat_j + mat_i; 2793 max2 = rbuf2_i[ct1]; 2794 if (!allcolumns[is_no]) { 2795 for (l=0; l<max2; l++,ct2++) { 2796 #if defined(PETSC_USE_CTABLE) 2797 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2798 #else 2799 tcol = cmap_i[rbuf3_i[ct2]]; 2800 #endif 2801 if (tcol) { 2802 *mat_j++ = tcol - 1; 2803 *mat_a++ = rbuf4_i[ct2]; 2804 ilen++; 2805 } 2806 } 2807 } else { /* allcolumns */ 2808 for (l=0; l<max2; l++,ct2++) { 2809 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2810 *mat_a++ = rbuf4_i[ct2]; 2811 ilen++; 2812 } 2813 } 2814 imat_ilen[row] = ilen; 2815 } 2816 } 2817 } 2818 2819 if (!iscsorted) { /* sort column indices of the rows */ 2820 for (i=0; i<ismax; i++) { 2821 subc = (Mat_SeqAIJ*)submats[i]->data; 2822 imat_j = subc->j; 2823 imat_i = subc->i; 2824 imat_a = subc->a; 2825 imat_ilen = subc->ilen; 2826 
2827 if (allcolumns[i]) continue; 2828 jmax = nrow[i]; 2829 for (j=0; j<jmax; j++) { 2830 mat_i = imat_i[j]; 2831 mat_a = imat_a + mat_i; 2832 mat_j = imat_j + mat_i; 2833 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2834 } 2835 } 2836 } 2837 2838 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2839 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2840 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2841 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2842 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2843 2844 /* Restore the indices */ 2845 for (i=0; i<ismax; i++) { 2846 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2847 if (!allcolumns[i]) { 2848 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2849 } 2850 } 2851 2852 for (i=0; i<ismax; i++) { 2853 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2854 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2855 } 2856 2857 /* Destroy allocated memory */ 2858 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2859 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2860 iirow = (PetscInt**) irow; /* required because PetscFreeN() cannot work on const */ 2861 iicol = (PetscInt**) icol; 2862 ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr); 2863 2864 for (i=0; i<nrqs; ++i) { 2865 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2866 } 2867 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2868 2869 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2870 PetscFunctionReturn(0); 2871 } 2872 2873 /* 2874 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2875 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2876 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2877 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 
2878 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2879 state, and needs to be "assembled" later by compressing B's column space. 2880 2881 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2882 Following this call, C->A & C->B have been created, even if empty. 2883 */ 2884 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2885 { 2886 /* If making this function public, change the error returned in this function away from _PLIB. */ 2887 PetscErrorCode ierr; 2888 Mat_MPIAIJ *aij; 2889 Mat_SeqAIJ *Baij; 2890 PetscBool seqaij,Bdisassembled; 2891 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2892 PetscScalar v; 2893 const PetscInt *rowindices,*colindices; 2894 2895 PetscFunctionBegin; 2896 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2897 if (A) { 2898 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2899 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2900 if (rowemb) { 2901 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2902 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2903 } else { 2904 if (C->rmap->n != A->rmap->n) { 2905 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2906 } 2907 } 2908 if (dcolemb) { 2909 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2910 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2911 } else { 2912 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2913 } 2914 } 2915 if (B) { 2916 ierr = 
PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2917 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2918 if (rowemb) { 2919 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2920 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2921 } else { 2922 if (C->rmap->n != B->rmap->n) { 2923 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2924 } 2925 } 2926 if (ocolemb) { 2927 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2928 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2929 } else { 2930 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2931 } 2932 } 2933 2934 aij = (Mat_MPIAIJ*)C->data; 2935 if (!aij->A) { 2936 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2937 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2938 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2939 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2940 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2941 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2942 } 2943 if (A) { 2944 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2945 } else { 2946 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2947 } 2948 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2949 /* 2950 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2951 need to "disassemble" B -- convert it to using C's global indices. 2952 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 
2953 2954 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2955 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2956 2957 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2958 At least avoid calling MatSetValues() and the implied searches? 2959 */ 2960 2961 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2962 #if defined(PETSC_USE_CTABLE) 2963 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2964 #else 2965 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2966 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2967 if (aij->B) { 2968 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2969 } 2970 #endif 2971 ngcol = 0; 2972 if (aij->lvec) { 2973 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2974 } 2975 if (aij->garray) { 2976 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2977 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2978 } 2979 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2980 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2981 } 2982 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2983 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2984 } 2985 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2986 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2987 } 2988 } 2989 Bdisassembled = PETSC_FALSE; 2990 if (!aij->B) { 2991 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2992 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2993 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2994 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2995 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2996 Bdisassembled = PETSC_TRUE; 2997 } 2998 if (B) { 2999 Baij = (Mat_SeqAIJ*)B->data; 3000 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3001 
ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3002 for (i=0; i<B->rmap->n; i++) { 3003 nz[i] = Baij->i[i+1] - Baij->i[i]; 3004 } 3005 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3006 ierr = PetscFree(nz);CHKERRQ(ierr); 3007 } 3008 3009 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3010 shift = rend-rstart; 3011 count = 0; 3012 rowindices = NULL; 3013 colindices = NULL; 3014 if (rowemb) { 3015 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3016 } 3017 if (ocolemb) { 3018 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3019 } 3020 for (i=0; i<B->rmap->n; i++) { 3021 PetscInt row; 3022 row = i; 3023 if (rowindices) row = rowindices[i]; 3024 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3025 col = Baij->j[count]; 3026 if (colindices) col = colindices[col]; 3027 if (Bdisassembled && col>=rstart) col += shift; 3028 v = Baij->a[count]; 3029 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3030 ++count; 3031 } 3032 } 3033 /* No assembly for aij->B is necessary. */ 3034 /* FIXME: set aij->B's nonzerostate correctly. */ 3035 } else { 3036 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3037 } 3038 C->preallocated = PETSC_TRUE; 3039 C->was_assembled = PETSC_FALSE; 3040 C->assembled = PETSC_FALSE; 3041 /* 3042 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3043 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3044 */ 3045 PetscFunctionReturn(0); 3046 } 3047 3048 /* 3049 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
 */
/*
  Returns the diagonal (*A) and off-diagonal (*B) sequential blocks of the MPIAIJ matrix C.
  The returned references are borrowed (see note below): the caller must not destroy them.
*/
PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  /* FIXME: make sure C is assembled */
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
  Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).

  The four function-pointer arguments abstract over the concrete matrix type so this driver can be
  shared by several AIJ-like formats; see MatCreateSubMatricesMPI_MPIAIJ() below for the AIJ bindings.
  NOTE(review): getlocalmats is accepted but never called in this routine -- confirm whether it is
  reserved for future use or dead.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the ISs that live on a communicator of size > 1 (the "parallel" ISs). */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize; /* NOTE(review): shadows the function-scope isize declared above -- confirm intentional */
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
        (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
        were sorted on each rank, concatenated they might no longer be sorted;
        (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
        indices in the nondecreasing order to the original index positions.
        If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt j,issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    /* The IS is sorted now, so duplicates are necessarily adjacent. */
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);


    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction; ii tracks the parallel-IS count. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately.
  */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary).
    */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = A[i];
        BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }

        /* AA is consumed here; BB (== B[ii]) is released later by MatDestroySubMatrices(cismax,&B). */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        /* Unsorted input ISs: permute A[i]'s values into a fresh matrix via setseqmat. */
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX).
        */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i]; /* Ownership of A[i] transfers to the caller's submat array. */
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  /* The A array itself is freed here (its entries were consumed above); the B entries and array
     are released together. */
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MPIAIJ binding of the generic driver above: extract MPI submatrices of an MPIAIJ matrix
  for index sets that may live on subcommunicators of C.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}