1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 22 { 23 PetscErrorCode ierr; 24 PetscInt i; 25 26 PetscFunctionBegin; 27 PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 for (i=0; i<ov; ++i) { 29 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 30 } 31 PetscFunctionReturn(0); 32 } 33 34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 35 { 36 PetscErrorCode ierr; 37 PetscInt i; 38 39 PetscFunctionBegin; 40 PetscCheckFalse(ov < 0,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 41 for (i=0; i<ov; ++i) { 42 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 43 } 44 
PetscFunctionReturn(0); 45 } 46 47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 48 { 49 PetscErrorCode ierr; 50 MPI_Comm comm; 51 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j; 52 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 53 PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL; 54 const PetscInt *indices_i,**indices; 55 PetscLayout rmap; 56 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner; 57 PetscSF sf; 58 PetscSFNode *remote; 59 60 PetscFunctionBegin; 61 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 62 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 63 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 64 /* get row map to determine where rows should be going */ 65 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 66 /* retrieve IS data and put all together so that we 67 * can optimize communication 68 * */ 69 ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 70 for (i=0,tlength=0; i<nidx; i++) { 71 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 72 tlength += length[i]; 73 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 74 } 75 /* find these rows on remote processors */ 76 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 77 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 78 nrrows = 0; 79 for (i=0; i<nidx; i++) { 80 length_i = length[i]; 81 indices_i = indices[i]; 82 for (j=0; j<length_i; j++) { 83 owner = -1; 84 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 85 /* remote processors */ 86 if (owner != rank) { 87 tosizes_temp[owner]++; /* number of rows to owner */ 88 rrow_ranks[nrrows] = owner; /* processor */ 89 rrow_isids[nrrows] = i; /* is id */ 90 remoterows[nrrows++] = indices_i[j]; /* row */ 91 } 92 } 93 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 
94 } 95 ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr); 96 /* test if we need to exchange messages 97 * generally speaking, we do not need to exchange 98 * data when overlap is 1 99 * */ 100 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr); 101 /* we do not have any messages 102 * It usually corresponds to overlap 1 103 * */ 104 if (!reducednrrows) { 105 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 106 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 107 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 nto = 0; 111 /* send sizes and ranks for building a two-sided communcation */ 112 for (i=0; i<size; i++) { 113 if (tosizes_temp[i]) { 114 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 115 tosizes_temp[i] = nto; /* a map from processor to index */ 116 toranks[nto++] = i; /* processor */ 117 } 118 } 119 ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr); 120 toffsets[0] = 0; 121 for (i=0; i<nto; i++) { 122 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 123 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 124 } 125 /* send information to other processors */ 126 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 127 nrecvrows = 0; 128 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 129 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 130 nrecvrows = 0; 131 for (i=0; i<nfrom; i++) { 132 for (j=0; j<fromsizes[2*i]; j++) { 133 remote[nrecvrows].rank = fromranks[i]; 134 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 135 } 136 } 137 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 138 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 139 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 140 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 
141 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 142 /* message pair <no of is, row> */ 143 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 144 for (i=0; i<nrrows; i++) { 145 owner = rrow_ranks[i]; /* processor */ 146 j = tosizes_temp[owner]; /* index */ 147 todata[toffsets[j]++] = rrow_isids[i]; 148 todata[toffsets[j]++] = remoterows[i]; 149 } 150 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 151 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 152 ierr = PetscFree(toffsets);CHKERRQ(ierr); 153 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr); 154 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr); 155 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 156 /* send rows belonging to the remote so that then we could get the overlapping data back */ 157 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 158 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 159 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 160 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 161 ierr = PetscFree(fromranks);CHKERRQ(ierr); 162 nrecvrows = 0; 163 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 164 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 165 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 166 nrecvrows = 0; 167 for (i=0; i<nto; i++) { 168 for (j=0; j<tosizes[2*i]; j++) { 169 remote[nrecvrows].rank = toranks[i]; 170 remote[nrecvrows++].index = tosizes[2*i+1]+j; 171 } 172 } 173 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 174 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 175 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 176 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 177 ierr = 
PetscSFSetFromOptions(sf);CHKERRQ(ierr); 178 /* overlap communication and computation */ 179 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr); 180 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 181 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr); 182 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 183 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 184 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 185 ierr = PetscFree(toranks);CHKERRQ(ierr); 186 ierr = PetscFree(tosizes);CHKERRQ(ierr); 187 ierr = PetscFree(todata);CHKERRQ(ierr); 188 PetscFunctionReturn(0); 189 } 190 191 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 192 { 193 PetscInt *isz,isz_i,i,j,is_id, data_size; 194 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 195 const PetscInt *indices_i_temp; 196 MPI_Comm *iscomms; 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 max_lsize = 0; 201 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 202 for (i=0; i<nidx; i++) { 203 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 204 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 205 isz[i] = lsize; 206 } 207 ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr); 208 for (i=0; i<nidx; i++) { 209 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 210 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 211 ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr); 212 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 213 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 214 } 215 /* retrieve information to get row id and its overlap */ 216 for (i=0; i<nrecvs;) { 217 is_id = recvdata[i++]; 218 data_size = recvdata[i++]; 219 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 220 isz_i = isz[is_id]; 221 for (j=0; j< data_size; j++) { 222 col = recvdata[i++]; 223 indices_i[isz_i++] = col; 224 } 225 isz[is_id] = isz_i; 226 } 227 /* remove duplicate entities */ 228 for (i=0; i<nidx; i++) { 229 indices_i = indices_temp+(max_lsize+nrecvs)*i; 230 isz_i = isz[i]; 231 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 232 ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 233 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 234 } 235 ierr = PetscFree(isz);CHKERRQ(ierr); 236 ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr); 237 PetscFunctionReturn(0); 238 } 239 240 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 241 { 242 PetscLayout rmap,cmap; 243 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 244 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 245 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 246 const PetscInt *gcols,*ai,*aj,*bi,*bj; 247 Mat amat,bmat; 248 PetscMPIInt rank; 249 PetscBool done; 250 MPI_Comm comm; 251 
PetscErrorCode ierr; 252 253 PetscFunctionBegin; 254 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 255 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 256 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 257 /* Even if the mat is symmetric, we still assume it is not symmetric */ 258 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 259 PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 260 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 261 PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 262 /* total number of nonzero values is used to estimate the memory usage in the next step */ 263 tnz = ai[an]+bi[bn]; 264 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 265 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 266 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 267 /* to find the longest message */ 268 max_fszs = 0; 269 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 270 /* better way to estimate number of nonzero in the mat??? 
*/ 271 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 272 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 273 rows_pos = 0; 274 totalrows = 0; 275 for (i=0; i<nfrom; i++) { 276 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 277 /* group data together */ 278 for (j=0; j<fromsizes[2*i]; j+=2) { 279 is_id = fromrows[rows_pos++];/* no of is */ 280 rows_i = rows_data[is_id]; 281 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 282 } 283 /* estimate a space to avoid multiple allocations */ 284 for (j=0; j<nidx; j++) { 285 indvc_ij = 0; 286 rows_i = rows_data[j]; 287 for (l=0; l<rows_pos_i[j]; l++) { 288 row = rows_i[l]-rstart; 289 start = ai[row]; 290 end = ai[row+1]; 291 for (k=start; k<end; k++) { /* Amat */ 292 col = aj[k] + cstart; 293 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 294 } 295 start = bi[row]; 296 end = bi[row+1]; 297 for (k=start; k<end; k++) { /* Bmat */ 298 col = gcols[bj[k]]; 299 indices_tmp[indvc_ij++] = col; 300 } 301 } 302 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 303 indv_counts[i*nidx+j] = indvc_ij; 304 totalrows += indvc_ij; 305 } 306 } 307 /* message triple <no of is, number of rows, rows> */ 308 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 309 totalrows = 0; 310 rows_pos = 0; 311 /* use this code again */ 312 for (i=0;i<nfrom;i++) { 313 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 314 for (j=0; j<fromsizes[2*i]; j+=2) { 315 is_id = fromrows[rows_pos++]; 316 rows_i = rows_data[is_id]; 317 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 318 } 319 /* add data */ 320 for (j=0; j<nidx; j++) { 321 if (!indv_counts[i*nidx+j]) continue; 322 indvc_ij = 0; 323 sbdata[totalrows++] = j; 324 sbdata[totalrows++] = indv_counts[i*nidx+j]; 325 sbsizes[2*i] += 2; 326 rows_i = rows_data[j]; 327 for (l=0; l<rows_pos_i[j]; 
l++) { 328 row = rows_i[l]-rstart; 329 start = ai[row]; 330 end = ai[row+1]; 331 for (k=start; k<end; k++) { /* Amat */ 332 col = aj[k] + cstart; 333 indices_tmp[indvc_ij++] = col; 334 } 335 start = bi[row]; 336 end = bi[row+1]; 337 for (k=start; k<end; k++) { /* Bmat */ 338 col = gcols[bj[k]]; 339 indices_tmp[indvc_ij++] = col; 340 } 341 } 342 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 343 sbsizes[2*i] += indvc_ij; 344 ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr); 345 totalrows += indvc_ij; 346 } 347 } 348 ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 349 offsets[0] = 0; 350 for (i=0; i<nfrom; i++) { 351 offsets[i+1] = offsets[i] + sbsizes[2*i]; 352 sbsizes[2*i+1] = offsets[i]; 353 } 354 ierr = PetscFree(offsets);CHKERRQ(ierr); 355 if (sbrowsizes) *sbrowsizes = sbsizes; 356 if (sbrows) *sbrows = sbdata; 357 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 358 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 359 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 360 PetscFunctionReturn(0); 361 } 362 363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 364 { 365 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 366 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 367 PetscInt lsize,lsize_tmp; 368 PetscMPIInt rank,owner; 369 Mat amat,bmat; 370 PetscBool done; 371 PetscLayout cmap,rmap; 372 MPI_Comm comm; 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 377 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 378 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 379 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 380 PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 
381 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 382 PetscCheckFalse(!done,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ "); 383 /* is it a safe way to compute number of nonzero values ? */ 384 tnz = ai[an]+bi[bn]; 385 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 386 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 387 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 388 /* it is a better way to estimate memory than the old implementation 389 * where global size of matrix is used 390 * */ 391 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 392 for (i=0; i<nidx; i++) { 393 MPI_Comm iscomm; 394 395 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 396 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 397 lsize_tmp = 0; 398 for (j=0; j<lsize; j++) { 399 owner = -1; 400 row = indices[j]; 401 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 402 if (owner != rank) continue; 403 /* local number */ 404 row -= rstart; 405 start = ai[row]; 406 end = ai[row+1]; 407 for (k=start; k<end; k++) { /* Amat */ 408 col = aj[k] + cstart; 409 indices_temp[lsize_tmp++] = col; 410 } 411 start = bi[row]; 412 end = bi[row+1]; 413 for (k=start; k<end; k++) { /* Bmat */ 414 col = gcols[bj[k]]; 415 indices_temp[lsize_tmp++] = col; 416 } 417 } 418 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 419 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr); 420 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 421 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 422 ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 423 ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr); 424 } 425 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 426 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 427 ierr = 
MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 428 PetscFunctionReturn(0); 429 } 430 431 /* 432 Sample message format: 433 If a processor A wants processor B to process some elements corresponding 434 to index sets is[1],is[5] 435 mesg [0] = 2 (no of index sets in the mesg) 436 ----------- 437 mesg [1] = 1 => is[1] 438 mesg [2] = sizeof(is[1]); 439 ----------- 440 mesg [3] = 5 => is[5] 441 mesg [4] = sizeof(is[5]); 442 ----------- 443 mesg [5] 444 mesg [n] datas[1] 445 ----------- 446 mesg[n+1] 447 mesg[m] data(is[5]) 448 ----------- 449 450 Notes: 451 nrqs - no of requests sent (or to be sent out) 452 nrqr - no of requests received (which have to be or which have been processed) 453 */ 454 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 455 { 456 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 457 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 458 const PetscInt **idx,*idx_i; 459 PetscInt *n,**data,len; 460 #if defined(PETSC_USE_CTABLE) 461 PetscTable *table_data,table_data_i; 462 PetscInt *tdata,tcount,tcount_max; 463 #else 464 PetscInt *data_i,*d_p; 465 #endif 466 PetscErrorCode ierr; 467 PetscMPIInt size,rank,tag1,tag2,proc = 0; 468 PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 469 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 470 PetscBT *table; 471 MPI_Comm comm; 472 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 473 MPI_Status *recv_status; 474 MPI_Comm *iscomms; 475 char *t_p; 476 477 PetscFunctionBegin; 478 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 479 size = c->size; 480 rank = c->rank; 481 M = C->rmap->N; 482 483 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 484 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 485 486 ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 487 488 for (i=0; i<imax; i++) { 489 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 490 ierr = 
ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 491 } 492 493 /* evaluate communication - mesg to who,length of mesg, and buffer space 494 required. Based on this, buffers are allocated, and data copied into them */ 495 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 496 for (i=0; i<imax; i++) { 497 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/ 498 idx_i = idx[i]; 499 len = n[i]; 500 for (j=0; j<len; j++) { 501 row = idx_i[j]; 502 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 503 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 504 w4[proc]++; 505 } 506 for (j=0; j<size; j++) { 507 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 508 } 509 } 510 511 nrqs = 0; /* no of outgoing messages */ 512 msz = 0; /* total mesg length (for all proc */ 513 w1[rank] = 0; /* no mesg sent to intself */ 514 w3[rank] = 0; 515 for (i=0; i<size; i++) { 516 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 517 } 518 /* pa - is list of processors to communicate with */ 519 ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); 520 for (i=0,j=0; i<size; i++) { 521 if (w1[i]) {pa[j] = i; j++;} 522 } 523 524 /* Each message would have a header = 1 + 2*(no of IS) + data */ 525 for (i=0; i<nrqs; i++) { 526 j = pa[i]; 527 w1[j] += w2[j] + 2*w3[j]; 528 msz += w1[j]; 529 } 530 531 /* Determine the number of messages to expect, their lengths, from from-ids */ 532 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 533 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 534 535 /* Now post the Irecvs corresponding to these messages */ 536 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 537 538 /* Allocate Memory for outgoing messages */ 539 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 540 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr); 541 ierr = 
PetscArrayzero(ptr,size);CHKERRQ(ierr); 542 543 { 544 PetscInt *iptr = tmp,ict = 0; 545 for (i=0; i<nrqs; i++) { 546 j = pa[i]; 547 iptr += ict; 548 outdat[j] = iptr; 549 ict = w1[j]; 550 } 551 } 552 553 /* Form the outgoing messages */ 554 /* plug in the headers */ 555 for (i=0; i<nrqs; i++) { 556 j = pa[i]; 557 outdat[j][0] = 0; 558 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr); 559 ptr[j] = outdat[j] + 2*w3[j] + 1; 560 } 561 562 /* Memory for doing local proc's work */ 563 { 564 PetscInt M_BPB_imax = 0; 565 #if defined(PETSC_USE_CTABLE) 566 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 567 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 568 for (i=0; i<imax; i++) { 569 ierr = PetscTableCreate(n[i],M,&table_data[i]);CHKERRQ(ierr); 570 } 571 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 572 for (i=0; i<imax; i++) { 573 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 574 } 575 #else 576 PetscInt Mimax = 0; 577 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 578 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 579 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 580 for (i=0; i<imax; i++) { 581 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 582 data[i] = d_p + M*i; 583 } 584 #endif 585 } 586 587 /* Parse the IS and update local tables and the outgoing buf with the data */ 588 { 589 PetscInt n_i,isz_i,*outdat_j,ctr_j; 590 PetscBT table_i; 591 592 for (i=0; i<imax; i++) { 593 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 594 n_i = n[i]; 595 table_i = table[i]; 596 idx_i = idx[i]; 597 #if defined(PETSC_USE_CTABLE) 598 table_data_i = table_data[i]; 599 #else 600 data_i = data[i]; 601 #endif 602 isz_i = isz[i]; 603 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 604 row = idx_i[j]; 605 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 606 if (proc != rank) { /* 
copy to the outgoing buffer */ 607 ctr[proc]++; 608 *ptr[proc] = row; 609 ptr[proc]++; 610 } else if (!PetscBTLookupSet(table_i,row)) { 611 #if defined(PETSC_USE_CTABLE) 612 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 613 #else 614 data_i[isz_i] = row; /* Update the local table */ 615 #endif 616 isz_i++; 617 } 618 } 619 /* Update the headers for the current IS */ 620 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 621 if ((ctr_j = ctr[j])) { 622 outdat_j = outdat[j]; 623 k = ++outdat_j[0]; 624 outdat_j[2*k] = ctr_j; 625 outdat_j[2*k-1] = i; 626 } 627 } 628 isz[i] = isz_i; 629 } 630 } 631 632 /* Now post the sends */ 633 ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr); 634 for (i=0; i<nrqs; ++i) { 635 j = pa[i]; 636 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 637 } 638 639 /* No longer need the original indices */ 640 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 641 for (i=0; i<imax; ++i) { 642 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 643 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 644 } 645 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 646 647 for (i=0; i<imax; ++i) { 648 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 649 } 650 651 /* Do Local work */ 652 #if defined(PETSC_USE_CTABLE) 653 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 654 #else 655 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 656 #endif 657 658 /* Receive messages */ 659 ierr = PetscMalloc1(nrqr,&recv_status);CHKERRQ(ierr); 660 ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr); 661 ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 662 663 /* Phase 1 sends are complete - deallocate buffers */ 664 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 665 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 666 667 ierr = 
PetscMalloc1(nrqr,&xdata);CHKERRQ(ierr); 668 ierr = PetscMalloc1(nrqr,&isz1);CHKERRQ(ierr); 669 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 670 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 671 ierr = PetscFree(rbuf);CHKERRQ(ierr); 672 673 /* Send the data back */ 674 /* Do a global reduction to know the buffer space req for incoming messages */ 675 { 676 PetscMPIInt *rw1; 677 678 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 679 680 for (i=0; i<nrqr; ++i) { 681 proc = recv_status[i].MPI_SOURCE; 682 683 PetscCheckFalse(proc != onodes1[i],PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 684 rw1[proc] = isz1[i]; 685 } 686 ierr = PetscFree(onodes1);CHKERRQ(ierr); 687 ierr = PetscFree(olengths1);CHKERRQ(ierr); 688 689 /* Determine the number of messages to expect, their lengths, from from-ids */ 690 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 691 ierr = PetscFree(rw1);CHKERRQ(ierr); 692 } 693 /* Now post the Irecvs corresponding to these messages */ 694 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 695 696 /* Now post the sends */ 697 ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr); 698 for (i=0; i<nrqr; ++i) { 699 j = recv_status[i].MPI_SOURCE; 700 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr); 701 } 702 703 /* receive work done on other processors */ 704 { 705 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 706 PetscMPIInt idex; 707 PetscBT table_i; 708 709 for (i=0; i<nrqs; ++i) { 710 ierr = MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);CHKERRMPI(ierr); 711 /* Process the message */ 712 rbuf2_i = rbuf2[idex]; 713 ct1 = 2*rbuf2_i[0]+1; 714 jmax = rbuf2[idex][0]; 715 for (j=1; j<=jmax; j++) { 716 max = rbuf2_i[2*j]; 717 is_no = rbuf2_i[2*j-1]; 718 isz_i = isz[is_no]; 719 table_i = table[is_no]; 720 #if defined(PETSC_USE_CTABLE) 721 table_data_i = table_data[is_no]; 722 #else 723 data_i = data[is_no]; 724 
#endif 725 for (k=0; k<max; k++,ct1++) { 726 row = rbuf2_i[ct1]; 727 if (!PetscBTLookupSet(table_i,row)) { 728 #if defined(PETSC_USE_CTABLE) 729 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 730 #else 731 data_i[isz_i] = row; 732 #endif 733 isz_i++; 734 } 735 } 736 isz[is_no] = isz_i; 737 } 738 } 739 740 ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 741 } 742 743 #if defined(PETSC_USE_CTABLE) 744 tcount_max = 0; 745 for (i=0; i<imax; ++i) { 746 table_data_i = table_data[i]; 747 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 748 if (tcount_max < tcount) tcount_max = tcount; 749 } 750 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 751 #endif 752 753 for (i=0; i<imax; ++i) { 754 #if defined(PETSC_USE_CTABLE) 755 PetscTablePosition tpos; 756 table_data_i = table_data[i]; 757 758 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 759 while (tpos) { 760 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 761 tdata[--j] = --k; 762 } 763 ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 764 #else 765 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 766 #endif 767 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 768 } 769 770 ierr = PetscFree(iscomms);CHKERRQ(ierr); 771 ierr = PetscFree(onodes2);CHKERRQ(ierr); 772 ierr = PetscFree(olengths2);CHKERRQ(ierr); 773 774 ierr = PetscFree(pa);CHKERRQ(ierr); 775 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 776 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 777 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 778 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 779 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 780 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 781 ierr = PetscFree(recv_status);CHKERRQ(ierr); 782 if (xdata) { 783 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 784 ierr = PetscFree(xdata);CHKERRQ(ierr); 785 } 786 ierr = PetscFree(isz1);CHKERRQ(ierr); 787 #if 
defined(PETSC_USE_CTABLE)
  /* -- tail of MatIncreaseOverlap_MPIAIJ_Once: free the per-IS hash tables and work arrays -- */
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor: every index set is enlarged by the global
   column indices of each row currently in the set, taken from both the
   diagonal block (c->A) and the off-diagonal block (c->B) of the local part
   of C.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits; bit `col' of table[i] records
               that global column `col' already belongs to set i (duplicate filter).

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  /* cache ownership offsets and the CSR arrays of the two local blocks */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps local column of B to global column of C */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[]; the table stores
       (row+1 -> position+1), hence the -- adjustments below */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    PetscCheckFalse(tcount != isz[i],PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %" PetscInt_FMT " != isz[%" PetscInt_FMT "] %" PetscInt_FMT,tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      PetscCheckFalse(j > tcount - 1,PETSC_COMM_SELF,PETSC_ERR_PLIB," j %" PetscInt_FMT " >= tcount %" PetscInt_FMT,j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    /* only the entries already present are expanded here; entries appended
       below are picked up by the next overlap round, not this one */
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart; /* local row index */
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: diagonal-block columns, shifted to global numbering */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { /* not seen before: append */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: off-diagonal columns mapped through garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf  - an array of pointers to the received requests

   Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

   Message layout (both rbuf[i] and xdata[i]): word 0 holds the number of
   index sets in the message; words 2*j-1 and 2*j hold (is id, entry count)
   for set j; the row indices follow after word 2*rbuf_i[0].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* maps local column of B to global column of C */

  /* total number of requested rows (ct) and sum of the per-set row counts (total_sz) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial estimate of the reply buffer size: average nonzeros per local row
     times the number of requested rows; grown by 1.5x on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  if (nrqr) {
    /* all xdata[i] are slices of this single allocation; xdata[i+1] is seated below */
    ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
    ++no_malloc;
  }
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0; /* total words written into xdata[0] so far (across all messages) */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1; /* read cursor: first row index in rbuf_i */
    ct2    = ct1;        /* write cursor into xdata[i]; header space is skipped */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); /* duplicate filter is per IS */
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer and re-seat the xdata[] pointers of all
               messages processed so far */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append, for each stored row, its global column indices from the
         diagonal (A) and off-diagonal (B) blocks; max2 is fixed at entry, so
         only the rows copied above are expanded (one level of overlap) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2;    /* number of entries stored for this IS */
      xdata[i][2*j-1] = rbuf_i[2*j-1]; /* echo the IS id back to the requester */
    }
    xdata[i][0] = rbuf_0;
    if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i] = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  /* NOTE(review): the quantities reported as "bytes" here are PetscInt entry
     counts (mem_estimate/ct3 index xdata[0]), not byte counts -- confirm */
  ierr = PetscInfo(C,"Allocated %" PetscInt_FMT " bytes, required %" PetscInt_FMT " bytes, no of mallocs = %" PetscInt_FMT "\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix; the sequential copy is returned
    as (*Bin)[0].
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    /*
 ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* fill only the locally owned segment of lens[]; the in-place allgather
       below completes the other segments */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into my segment of B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge diagonal and off-diagonal columns of each local row into
       globally numbered, column-sorted order: below-diagonal B part,
       then the A part, then the above-diagonal B part */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts/displs are re-derived in nonzeros (not rows) for the j gather */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered; reuse the stored copy */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* my segment of B's value array */
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are interleaved in exactly the same order as the column
       indices were above, so they line up with b->j */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    PetscCheckFalse(cnt != sendcount,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %" PetscInt_FMT " actual nz %" PetscInt_FMT,sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatCreateSubMatrices_MPIAIJ_SingleIS_Local - extracts submats[0] =
   C(isrow[0],iscol[0]) as a sequential matrix; supports only the case in
   which every process passes exactly one index set (ismax == 1).
   (Definition continues beyond this chunk.)
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
  Mat         submat,A = c->A,B = c->B;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt    *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt    cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt    nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt    **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt    nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt    **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt    *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable  cmap,rmap;
  PetscInt    *cmap_loc,*rmap_loc;
#else
  PetscInt    *cmap,*rmap;
#endif
  PetscInt    ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt    *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status  *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm    comm;
  PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt *onodes1,*olengths1,idex,end;
  Mat_SubSppt *smatis1;
  PetscBool   isrowsorted,iscolsorted;

  PetscFunctionBegin;
  PetscValidLogicalCollectiveInt(C,ismax,2);
PetscValidLogicalCollectiveEnum(C,scall,5); 1254 PetscCheckFalse(ismax != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1255 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1256 ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1257 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1258 size = c->size; 1259 rank = c->rank; 1260 1261 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1262 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1263 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1264 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1265 if (allcolumns) { 1266 icol = NULL; 1267 ncol = C->cmap->N; 1268 } else { 1269 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1270 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1271 } 1272 1273 if (scall == MAT_INITIAL_MATRIX) { 1274 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1275 1276 /* Get some new tags to keep the communication clean */ 1277 tag1 = ((PetscObject)C)->tag; 1278 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1279 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1280 1281 /* evaluate communication - mesg to who, length of mesg, and buffer space 1282 required. 
Based on this, buffers are allocated, and data copied into them */ 1283 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1284 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1285 1286 /* w1[proc] = num of rows owned by proc -- to be requested */ 1287 proc = 0; 1288 nrqs = 0; /* num of outgoing messages */ 1289 for (j=0; j<nrow; j++) { 1290 row = irow[j]; 1291 if (!isrowsorted) proc = 0; 1292 while (row >= C->rmap->range[proc+1]) proc++; 1293 w1[proc]++; 1294 row2proc[j] = proc; /* map row index to proc */ 1295 1296 if (proc != rank && !w2[proc]) { 1297 w2[proc] = 1; nrqs++; 1298 } 1299 } 1300 w1[rank] = 0; /* rows owned by self will not be requested */ 1301 1302 ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1303 for (proc=0,j=0; proc<size; proc++) { 1304 if (w1[proc]) { pa[j++] = proc;} 1305 } 1306 1307 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1308 msz = 0; /* total mesg length (for all procs) */ 1309 for (i=0; i<nrqs; i++) { 1310 proc = pa[i]; 1311 w1[proc] += 3; 1312 msz += w1[proc]; 1313 } 1314 ierr = PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);CHKERRQ(ierr); 1315 1316 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1317 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1318 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1319 1320 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1321 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1322 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1323 1324 /* Now post the Irecvs corresponding to these messages */ 1325 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1326 1327 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1328 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1329 1330 /* Allocate 
Memory for outgoing messages */ 1331 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1332 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1333 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1334 1335 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1336 iptr = tmp; 1337 for (i=0; i<nrqs; i++) { 1338 proc = pa[i]; 1339 sbuf1[proc] = iptr; 1340 iptr += w1[proc]; 1341 } 1342 1343 /* Form the outgoing messages */ 1344 /* Initialize the header space */ 1345 for (i=0; i<nrqs; i++) { 1346 proc = pa[i]; 1347 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1348 ptr[proc] = sbuf1[proc] + 3; 1349 } 1350 1351 /* Parse the isrow and copy data into outbuf */ 1352 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1353 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1354 proc = row2proc[j]; 1355 if (proc != rank) { /* copy to the outgoing buf*/ 1356 *ptr[proc] = irow[j]; 1357 ctr[proc]++; ptr[proc]++; 1358 } 1359 } 1360 1361 /* Update the headers for the current IS */ 1362 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1363 if ((ctr_j = ctr[j])) { 1364 sbuf1_j = sbuf1[j]; 1365 k = ++sbuf1_j[0]; 1366 sbuf1_j[2*k] = ctr_j; 1367 sbuf1_j[2*k-1] = 0; 1368 } 1369 } 1370 1371 /* Now post the sends */ 1372 ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr); 1373 for (i=0; i<nrqs; ++i) { 1374 proc = pa[i]; 1375 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 1376 } 1377 1378 /* Post Receives to capture the buffer size */ 1379 ierr = PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);CHKERRQ(ierr); 1380 ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr); 1381 1382 if (nrqs) rbuf2[0] = tmp + msz; 1383 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1384 1385 for (i=0; i<nrqs; ++i) { 1386 proc = pa[i]; 1387 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 1388 } 1389 1390 
ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1391 1392 /* Send to other procs the buf size they should allocate */ 1393 /* Receive messages*/ 1394 ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr); 1395 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1396 1397 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 1398 for (i=0; i<nrqr; ++i) { 1399 req_size[i] = 0; 1400 rbuf1_i = rbuf1[i]; 1401 start = 2*rbuf1_i[0] + 1; 1402 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 1403 ierr = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr); 1404 sbuf2_i = sbuf2[i]; 1405 for (j=start; j<end; j++) { 1406 k = rbuf1_i[j] - rstart; 1407 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1408 sbuf2_i[j] = ncols; 1409 req_size[i] += ncols; 1410 } 1411 req_source1[i] = r_status1[i].MPI_SOURCE; 1412 1413 /* form the header */ 1414 sbuf2_i[0] = req_size[i]; 1415 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1416 1417 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 1418 } 1419 1420 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1421 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1422 1423 /* rbuf2 is received, Post recv column indices a->j */ 1424 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 1425 1426 ierr = PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);CHKERRQ(ierr); 1427 for (i=0; i<nrqs; ++i) { 1428 ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr); 1429 req_source2[i] = r_status2[i].MPI_SOURCE; 1430 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 1431 } 1432 1433 /* Wait on sends1 and sends2 */ 1434 ierr = PetscMalloc1(nrqs,&s_status1);CHKERRQ(ierr); 1435 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr); 1436 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1437 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1438 1439 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr); 1440 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1441 1442 /* Now allocate sending buffers for a->j, and send them off */ 1443 ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr); 1444 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1445 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);} 1446 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1447 1448 for (i=0; i<nrqr; i++) { /* for each requested message */ 1449 rbuf1_i = rbuf1[i]; 1450 sbuf_aj_i = sbuf_aj[i]; 1451 ct1 = 2*rbuf1_i[0] + 1; 1452 ct2 = 0; 1453 /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1454 1455 kmax = rbuf1[i][2]; 1456 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1457 row = rbuf1_i[ct1] - rstart; 1458 nzA = ai[row+1] - ai[row]; 1459 nzB = bi[row+1] - bi[row]; 1460 ncols = nzA + nzB; 1461 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1462 1463 /* load the column indices for this row into cols*/ 1464 cols = sbuf_aj_i + ct2; 1465 1466 lwrite = 0; 1467 for (l=0; l<nzB; l++) { 1468 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1469 } 1470 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1471 for (l=0; l<nzB; l++) { 1472 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1473 } 1474 1475 ct2 += ncols; 1476 } 1477 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 1478 } 1479 1480 /* create column map (cmap): global col of C -> local col of submat */ 1481 #if defined(PETSC_USE_CTABLE) 1482 if (!allcolumns) { 1483 ierr = PetscTableCreate(ncol,C->cmap->N,&cmap);CHKERRQ(ierr); 1484 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1485 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1486 if (icol[j] >= cstart && icol[j] <cend) { 1487 cmap_loc[icol[j] - cstart] = j+1; 1488 } else { /* use PetscTable for non-local col indices */ 1489 ierr = 
PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1490 } 1491 } 1492 } else { 1493 cmap = NULL; 1494 cmap_loc = NULL; 1495 } 1496 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1497 #else 1498 if (!allcolumns) { 1499 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1500 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1501 } else { 1502 cmap = NULL; 1503 } 1504 #endif 1505 1506 /* Create lens for MatSeqAIJSetPreallocation() */ 1507 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1508 1509 /* Compute lens from local part of C */ 1510 for (j=0; j<nrow; j++) { 1511 row = irow[j]; 1512 proc = row2proc[j]; 1513 if (proc == rank) { 1514 /* diagonal part A = c->A */ 1515 ncols = ai[row-rstart+1] - ai[row-rstart]; 1516 cols = aj + ai[row-rstart]; 1517 if (!allcolumns) { 1518 for (k=0; k<ncols; k++) { 1519 #if defined(PETSC_USE_CTABLE) 1520 tcol = cmap_loc[cols[k]]; 1521 #else 1522 tcol = cmap[cols[k]+cstart]; 1523 #endif 1524 if (tcol) lens[j]++; 1525 } 1526 } else { /* allcolumns */ 1527 lens[j] = ncols; 1528 } 1529 1530 /* off-diagonal part B = c->B */ 1531 ncols = bi[row-rstart+1] - bi[row-rstart]; 1532 cols = bj + bi[row-rstart]; 1533 if (!allcolumns) { 1534 for (k=0; k<ncols; k++) { 1535 #if defined(PETSC_USE_CTABLE) 1536 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1537 #else 1538 tcol = cmap[bmap[cols[k]]]; 1539 #endif 1540 if (tcol) lens[j]++; 1541 } 1542 } else { /* allcolumns */ 1543 lens[j] += ncols; 1544 } 1545 } 1546 } 1547 1548 /* Create row map (rmap): global row of C -> local row of submat */ 1549 #if defined(PETSC_USE_CTABLE) 1550 ierr = PetscTableCreate(nrow,C->rmap->N,&rmap);CHKERRQ(ierr); 1551 for (j=0; j<nrow; j++) { 1552 row = irow[j]; 1553 proc = row2proc[j]; 1554 if (proc == rank) { /* a local row */ 1555 rmap_loc[row - rstart] = j; 1556 } else { 1557 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1558 } 1559 } 1560 #else 1561 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1562 
for (j=0; j<nrow; j++) { 1563 rmap[irow[j]] = j; 1564 } 1565 #endif 1566 1567 /* Update lens from offproc data */ 1568 /* recv a->j is done */ 1569 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 1570 for (i=0; i<nrqs; i++) { 1571 proc = pa[i]; 1572 sbuf1_i = sbuf1[proc]; 1573 /* jmax = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1574 ct1 = 2 + 1; 1575 ct2 = 0; 1576 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1577 rbuf3_i = rbuf3[i]; /* received C->j */ 1578 1579 /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1580 max1 = sbuf1_i[2]; 1581 for (k=0; k<max1; k++,ct1++) { 1582 #if defined(PETSC_USE_CTABLE) 1583 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1584 row--; 1585 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1586 #else 1587 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1588 #endif 1589 /* Now, store row index of submat in sbuf1_i[ct1] */ 1590 sbuf1_i[ct1] = row; 1591 1592 nnz = rbuf2_i[ct1]; 1593 if (!allcolumns) { 1594 for (l=0; l<nnz; l++,ct2++) { 1595 #if defined(PETSC_USE_CTABLE) 1596 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1597 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1598 } else { 1599 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1600 } 1601 #else 1602 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1603 #endif 1604 if (tcol) lens[row]++; 1605 } 1606 } else { /* allcolumns */ 1607 lens[row] += nnz; 1608 } 1609 } 1610 } 1611 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr); 1612 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1613 1614 /* Create the submatrices */ 1615 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1616 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1617 1618 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1619 ierr = 
ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1620 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1621 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1622 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1623 1624 /* create struct Mat_SubSppt and attached it to submat */ 1625 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1626 subc = (Mat_SeqAIJ*)submat->data; 1627 subc->submatis1 = smatis1; 1628 1629 smatis1->id = 0; 1630 smatis1->nrqs = nrqs; 1631 smatis1->nrqr = nrqr; 1632 smatis1->rbuf1 = rbuf1; 1633 smatis1->rbuf2 = rbuf2; 1634 smatis1->rbuf3 = rbuf3; 1635 smatis1->sbuf2 = sbuf2; 1636 smatis1->req_source2 = req_source2; 1637 1638 smatis1->sbuf1 = sbuf1; 1639 smatis1->ptr = ptr; 1640 smatis1->tmp = tmp; 1641 smatis1->ctr = ctr; 1642 1643 smatis1->pa = pa; 1644 smatis1->req_size = req_size; 1645 smatis1->req_source1 = req_source1; 1646 1647 smatis1->allcolumns = allcolumns; 1648 smatis1->singleis = PETSC_TRUE; 1649 smatis1->row2proc = row2proc; 1650 smatis1->rmap = rmap; 1651 smatis1->cmap = cmap; 1652 #if defined(PETSC_USE_CTABLE) 1653 smatis1->rmap_loc = rmap_loc; 1654 smatis1->cmap_loc = cmap_loc; 1655 #endif 1656 1657 smatis1->destroy = submat->ops->destroy; 1658 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1659 submat->factortype = C->factortype; 1660 1661 /* compute rmax */ 1662 rmax = 0; 1663 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1664 1665 } else { /* scall == MAT_REUSE_MATRIX */ 1666 submat = submats[0]; 1667 PetscCheckFalse(submat->rmap->n != nrow || submat->cmap->n != ncol,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1668 1669 subc = (Mat_SeqAIJ*)submat->data; 1670 rmax = subc->rmax; 1671 smatis1 = subc->submatis1; 1672 nrqs = smatis1->nrqs; 1673 nrqr = smatis1->nrqr; 1674 rbuf1 = smatis1->rbuf1; 1675 rbuf2 = smatis1->rbuf2; 1676 rbuf3 = smatis1->rbuf3; 1677 req_source2 = smatis1->req_source2; 1678 1679 sbuf1 = smatis1->sbuf1; 1680 sbuf2 = smatis1->sbuf2; 1681 ptr = smatis1->ptr; 1682 tmp = smatis1->tmp; 1683 ctr = smatis1->ctr; 1684 1685 pa = smatis1->pa; 1686 req_size = smatis1->req_size; 1687 req_source1 = smatis1->req_source1; 1688 1689 allcolumns = smatis1->allcolumns; 1690 row2proc = smatis1->row2proc; 1691 rmap = smatis1->rmap; 1692 cmap = smatis1->cmap; 1693 #if defined(PETSC_USE_CTABLE) 1694 rmap_loc = smatis1->rmap_loc; 1695 cmap_loc = smatis1->cmap_loc; 1696 #endif 1697 } 1698 1699 /* Post recv matrix values */ 1700 ierr = PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1701 ierr = PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);CHKERRQ(ierr); 1702 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1703 for (i=0; i<nrqs; ++i) { 1704 ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr); 1705 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 1706 } 1707 1708 /* Allocate sending buffers for a->a, and send them off */ 1709 ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr); 1710 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1711 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);} 1712 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1713 1714 for (i=0; i<nrqr; i++) { 1715 rbuf1_i = rbuf1[i]; 1716 sbuf_aa_i = sbuf_aa[i]; 1717 ct1 = 2*rbuf1_i[0]+1; 1718 ct2 = 0; 1719 /* max1=rbuf1_i[0]; PetscCheckFalse(max1 != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1720 1721 kmax = rbuf1_i[2]; 1722 for (k=0; k<kmax; k++,ct1++) { 1723 row = rbuf1_i[ct1] - rstart; 1724 nzA = ai[row+1] - ai[row]; 1725 nzB = bi[row+1] - 
bi[row]; 1726 ncols = nzA + nzB; 1727 cworkB = bj + bi[row]; 1728 vworkA = a_a + ai[row]; 1729 vworkB = b_a + bi[row]; 1730 1731 /* load the column values for this row into vals*/ 1732 vals = sbuf_aa_i + ct2; 1733 1734 lwrite = 0; 1735 for (l=0; l<nzB; l++) { 1736 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1737 } 1738 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1739 for (l=0; l<nzB; l++) { 1740 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1741 } 1742 1743 ct2 += ncols; 1744 } 1745 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 1746 } 1747 1748 /* Assemble submat */ 1749 /* First assemble the local rows */ 1750 for (j=0; j<nrow; j++) { 1751 row = irow[j]; 1752 proc = row2proc[j]; 1753 if (proc == rank) { 1754 Crow = row - rstart; /* local row index of C */ 1755 #if defined(PETSC_USE_CTABLE) 1756 row = rmap_loc[Crow]; /* row index of submat */ 1757 #else 1758 row = rmap[row]; 1759 #endif 1760 1761 if (allcolumns) { 1762 /* diagonal part A = c->A */ 1763 ncols = ai[Crow+1] - ai[Crow]; 1764 cols = aj + ai[Crow]; 1765 vals = a_a + ai[Crow]; 1766 i = 0; 1767 for (k=0; k<ncols; k++) { 1768 subcols[i] = cols[k] + cstart; 1769 subvals[i++] = vals[k]; 1770 } 1771 1772 /* off-diagonal part B = c->B */ 1773 ncols = bi[Crow+1] - bi[Crow]; 1774 cols = bj + bi[Crow]; 1775 vals = b_a + bi[Crow]; 1776 for (k=0; k<ncols; k++) { 1777 subcols[i] = bmap[cols[k]]; 1778 subvals[i++] = vals[k]; 1779 } 1780 1781 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1782 1783 } else { /* !allcolumns */ 1784 #if defined(PETSC_USE_CTABLE) 1785 /* diagonal part A = c->A */ 1786 ncols = ai[Crow+1] - ai[Crow]; 1787 cols = aj + ai[Crow]; 1788 vals = a_a + ai[Crow]; 1789 i = 0; 1790 for (k=0; k<ncols; k++) { 1791 tcol = cmap_loc[cols[k]]; 1792 if (tcol) { 1793 subcols[i] = --tcol; 1794 subvals[i++] = vals[k]; 1795 } 1796 } 1797 1798 /* off-diagonal part B = c->B */ 1799 
ncols = bi[Crow+1] - bi[Crow]; 1800 cols = bj + bi[Crow]; 1801 vals = b_a + bi[Crow]; 1802 for (k=0; k<ncols; k++) { 1803 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1804 if (tcol) { 1805 subcols[i] = --tcol; 1806 subvals[i++] = vals[k]; 1807 } 1808 } 1809 #else 1810 /* diagonal part A = c->A */ 1811 ncols = ai[Crow+1] - ai[Crow]; 1812 cols = aj + ai[Crow]; 1813 vals = a_a + ai[Crow]; 1814 i = 0; 1815 for (k=0; k<ncols; k++) { 1816 tcol = cmap[cols[k]+cstart]; 1817 if (tcol) { 1818 subcols[i] = --tcol; 1819 subvals[i++] = vals[k]; 1820 } 1821 } 1822 1823 /* off-diagonal part B = c->B */ 1824 ncols = bi[Crow+1] - bi[Crow]; 1825 cols = bj + bi[Crow]; 1826 vals = b_a + bi[Crow]; 1827 for (k=0; k<ncols; k++) { 1828 tcol = cmap[bmap[cols[k]]]; 1829 if (tcol) { 1830 subcols[i] = --tcol; 1831 subvals[i++] = vals[k]; 1832 } 1833 } 1834 #endif 1835 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1836 } 1837 } 1838 } 1839 1840 /* Now assemble the off-proc rows */ 1841 for (i=0; i<nrqs; i++) { /* for each requested message */ 1842 /* recv values from other processes */ 1843 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr); 1844 proc = pa[idex]; 1845 sbuf1_i = sbuf1[proc]; 1846 /* jmax = sbuf1_i[0]; PetscCheckFalse(jmax != 1,PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1847 ct1 = 2 + 1; 1848 ct2 = 0; /* count of received C->j */ 1849 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1850 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1851 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1852 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1853 1854 /* is_no = sbuf1_i[2*j-1]; PetscCheckFalse(is_no != 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1855 max1 = sbuf1_i[2]; /* num of rows */ 1856 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1857 row = sbuf1_i[ct1]; /* row index 
of submat */ 1858 if (!allcolumns) { 1859 idex = 0; 1860 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1861 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1862 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1863 #if defined(PETSC_USE_CTABLE) 1864 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1865 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1866 } else { 1867 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1868 } 1869 #else 1870 tcol = cmap[rbuf3_i[ct2]]; 1871 #endif 1872 if (tcol) { 1873 subcols[idex] = --tcol; /* may not be sorted */ 1874 subvals[idex++] = rbuf4_i[ct2]; 1875 1876 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1877 For reuse, we replace received C->j with index that should be inserted to submat */ 1878 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1879 } 1880 } 1881 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1882 } else { /* scall == MAT_REUSE_MATRIX */ 1883 submat = submats[0]; 1884 subc = (Mat_SeqAIJ*)submat->data; 1885 1886 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1887 for (l=0; l<nnz; l++) { 1888 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1889 subvals[idex++] = rbuf4_i[ct2]; 1890 } 1891 1892 bj = subc->j + subc->i[row]; /* sorted column indices */ 1893 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1894 } 1895 } else { /* allcolumns */ 1896 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1897 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1898 ct2 += nnz; 1899 } 1900 } 1901 } 1902 1903 /* sending a->a are done */ 1904 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr); 1905 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1906 1907 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1908 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  submats[0] = submat;

  /* Restore the indices borrowed from the caller's index sets */
  ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
  if (!allcolumns) {
    ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
  if (sbuf_aa) {
    /* sbuf_aa[0] is the single contiguous allocation the other sbuf_aa[i] point into */
    ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscFree(lens);CHKERRQ(ierr);
    if (sbuf_aj) {
      /* same layout as sbuf_aa: one buffer addressed through sbuf_aj[0] */
      ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
      ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
    }
  }
  ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
  ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCreateSubMatrices_MPIAIJ_SingleIS - extract the sequential submatrix
   C(isrow[0],iscol[0]) from the parallel AIJ matrix C, for the case where
   each process supplies a single index-set pair.

   It detects the special case where iscol[0] is the identity covering all
   global columns and forwards that fact (allcolumns) to the worker routine,
   which can then skip building a column map.

   Input Parameters:
+  C      - the parallel MPIAIJ matrix
.  ismax  - number of index-set pairs
.  isrow  - rows to extract
.  iscol  - columns to extract
-  scall  - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX

   Output Parameter:
.  submat - array whose first entry receives the extracted sequential matrix
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       ncol;
  PetscBool      colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices: two slots are zeroed,
     submat[0] plus one extra -- presumably mirroring the ismax+nstages
     layout used by MatCreateSubMatrices_MPIAIJ below; NOTE(review): confirm
     against the matching free in MatDestroySubMatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
  ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCreateSubMatrices_MPIAIJ - extract ismax sequential submatrices
   C(isrow[i],iscol[i]) from the parallel AIJ matrix C.

   The work is split into stages so that each stage handles at most nmax
   submatrices (bounding workspace to roughly 20M PetscInts); all ranks
   agree collectively on the global stage count and on whether the fast
   whole-matrix path applies.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* nmax = submatrices per stage, sized so one stage uses at most about
     20M PetscInts of buffer space; always at least 1 */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        /* the one requested submatrix is all of C: take the fast path
           unless the user turned it off */
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single MPI_MAX reduction for both quantities: in[0] is 0 or -1, so
       out[0] stays -1 only when EVERY rank set wantallmatrix (a logical AND);
       out[1] becomes the largest local stage count */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    /* Recover the staging info saved on the first (MAT_INITIAL_MATRIX) call */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  for (i=0,pos=0; i<nstages; i++) {
    /* max_no = submatrices handled this stage; it drops to 0 once this rank
       has exhausted its own requests, but _Local is still called so every
       rank executes all global stages */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no) {
      if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
        smat = (Mat_SubSppt*)(*submat)[pos]->data;
        smat->nstages = nstages;
      }
      pos++; /* advance to next dummy matrix if any */
    } else pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages
= nstages; 2056 } 2057 PetscFunctionReturn(0); 2058 } 2059 2060 /* -------------------------------------------------------------------------*/ 2061 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2062 { 2063 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2064 Mat A = c->A; 2065 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2066 const PetscInt **icol,**irow; 2067 PetscInt *nrow,*ncol,start; 2068 PetscErrorCode ierr; 2069 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2070 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2071 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2072 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2073 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2074 #if defined(PETSC_USE_CTABLE) 2075 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2076 #else 2077 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2078 #endif 2079 const PetscInt *irow_i; 2080 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2081 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2082 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2083 MPI_Comm comm; 2084 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2085 PetscMPIInt *onodes1,*olengths1,end; 2086 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2087 Mat_SubSppt *smat_i; 2088 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2089 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2090 2091 PetscFunctionBegin; 2092 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2093 size = c->size; 2094 rank = c->rank; 2095 2096 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2097 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 
2098 2099 for (i=0; i<ismax; i++) { 2100 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2101 if (!issorted[i]) iscsorted = issorted[i]; 2102 2103 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2104 2105 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2106 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2107 2108 /* Check for special case: allcolumn */ 2109 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2110 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2111 if (colflag && ncol[i] == C->cmap->N) { 2112 allcolumns[i] = PETSC_TRUE; 2113 icol[i] = NULL; 2114 } else { 2115 allcolumns[i] = PETSC_FALSE; 2116 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2117 } 2118 } 2119 2120 if (scall == MAT_REUSE_MATRIX) { 2121 /* Assumes new rows are same length as the old rows */ 2122 for (i=0; i<ismax; i++) { 2123 PetscCheckFalse(!submats[i],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%" PetscInt_FMT "] is null, cannot reuse",i); 2124 subc = (Mat_SeqAIJ*)submats[i]->data; 2125 PetscCheckFalse((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i]),PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 2126 2127 /* Initial matrix as if empty */ 2128 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2129 2130 smat_i = subc->submatis1; 2131 2132 nrqs = smat_i->nrqs; 2133 nrqr = smat_i->nrqr; 2134 rbuf1 = smat_i->rbuf1; 2135 rbuf2 = smat_i->rbuf2; 2136 rbuf3 = smat_i->rbuf3; 2137 req_source2 = smat_i->req_source2; 2138 2139 sbuf1 = smat_i->sbuf1; 2140 sbuf2 = smat_i->sbuf2; 2141 ptr = smat_i->ptr; 2142 tmp = smat_i->tmp; 2143 ctr = smat_i->ctr; 2144 2145 pa = smat_i->pa; 2146 req_size = smat_i->req_size; 2147 req_source1 = smat_i->req_source1; 2148 2149 allcolumns[i] = smat_i->allcolumns; 2150 row2proc[i] = smat_i->row2proc; 2151 rmap[i] = smat_i->rmap; 2152 cmap[i] = smat_i->cmap; 2153 } 2154 2155 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2156 PetscCheckFalse(!submats[0],PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2157 smat_i = (Mat_SubSppt*)submats[0]->data; 2158 2159 nrqs = smat_i->nrqs; 2160 nrqr = smat_i->nrqr; 2161 rbuf1 = smat_i->rbuf1; 2162 rbuf2 = smat_i->rbuf2; 2163 rbuf3 = smat_i->rbuf3; 2164 req_source2 = smat_i->req_source2; 2165 2166 sbuf1 = smat_i->sbuf1; 2167 sbuf2 = smat_i->sbuf2; 2168 ptr = smat_i->ptr; 2169 tmp = smat_i->tmp; 2170 ctr = smat_i->ctr; 2171 2172 pa = smat_i->pa; 2173 req_size = smat_i->req_size; 2174 req_source1 = smat_i->req_source1; 2175 2176 allcolumns[0] = PETSC_FALSE; 2177 } 2178 } else { /* scall == MAT_INITIAL_MATRIX */ 2179 /* Get some new tags to keep the communication clean */ 2180 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2181 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2182 2183 /* evaluate communication - mesg to who, length of mesg, and buffer space 2184 required. 
Based on this, buffers are allocated, and data copied into them*/ 2185 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2186 2187 for (i=0; i<ismax; i++) { 2188 jmax = nrow[i]; 2189 irow_i = irow[i]; 2190 2191 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2192 row2proc[i] = row2proc_i; 2193 2194 if (issorted[i]) proc = 0; 2195 for (j=0; j<jmax; j++) { 2196 if (!issorted[i]) proc = 0; 2197 row = irow_i[j]; 2198 while (row >= C->rmap->range[proc+1]) proc++; 2199 w4[proc]++; 2200 row2proc_i[j] = proc; /* map row index to proc */ 2201 } 2202 for (j=0; j<size; j++) { 2203 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2204 } 2205 } 2206 2207 nrqs = 0; /* no of outgoing messages */ 2208 msz = 0; /* total mesg length (for all procs) */ 2209 w1[rank] = 0; /* no mesg sent to self */ 2210 w3[rank] = 0; 2211 for (i=0; i<size; i++) { 2212 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2213 } 2214 ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2215 for (i=0,j=0; i<size; i++) { 2216 if (w1[i]) { pa[j] = i; j++; } 2217 } 2218 2219 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2220 for (i=0; i<nrqs; i++) { 2221 j = pa[i]; 2222 w1[j] += w2[j] + 2* w3[j]; 2223 msz += w1[j]; 2224 } 2225 ierr = PetscInfo(0,"Number of outgoing messages %" PetscInt_FMT " Total message length %" PetscInt_FMT "\n",nrqs,msz);CHKERRQ(ierr); 2226 2227 /* Determine the number of messages to expect, their lengths, from from-ids */ 2228 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2229 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2230 2231 /* Now post the Irecvs corresponding to these messages */ 2232 ierr = PetscObjectGetNewTag((PetscObject)C,&tag0);CHKERRQ(ierr); 2233 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2234 2235 /* Allocate Memory for outgoing messages */ 
2236 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2237 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2238 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2239 2240 { 2241 PetscInt *iptr = tmp; 2242 k = 0; 2243 for (i=0; i<nrqs; i++) { 2244 j = pa[i]; 2245 iptr += k; 2246 sbuf1[j] = iptr; 2247 k = w1[j]; 2248 } 2249 } 2250 2251 /* Form the outgoing messages. Initialize the header space */ 2252 for (i=0; i<nrqs; i++) { 2253 j = pa[i]; 2254 sbuf1[j][0] = 0; 2255 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2256 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2257 } 2258 2259 /* Parse the isrow and copy data into outbuf */ 2260 for (i=0; i<ismax; i++) { 2261 row2proc_i = row2proc[i]; 2262 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2263 irow_i = irow[i]; 2264 jmax = nrow[i]; 2265 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2266 proc = row2proc_i[j]; 2267 if (proc != rank) { /* copy to the outgoing buf*/ 2268 ctr[proc]++; 2269 *ptr[proc] = irow_i[j]; 2270 ptr[proc]++; 2271 } 2272 } 2273 /* Update the headers for the current IS */ 2274 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2275 if ((ctr_j = ctr[j])) { 2276 sbuf1_j = sbuf1[j]; 2277 k = ++sbuf1_j[0]; 2278 sbuf1_j[2*k] = ctr_j; 2279 sbuf1_j[2*k-1] = i; 2280 } 2281 } 2282 } 2283 2284 /* Now post the sends */ 2285 ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr); 2286 for (i=0; i<nrqs; ++i) { 2287 j = pa[i]; 2288 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr); 2289 } 2290 2291 /* Post Receives to capture the buffer size */ 2292 ierr = PetscMalloc1(nrqs,&r_waits2);CHKERRQ(ierr); 2293 ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr); 2294 if (nrqs) rbuf2[0] = tmp + msz; 2295 for (i=1; i<nrqs; ++i) { 2296 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2297 } 2298 for (i=0; i<nrqs; ++i) { 2299 j = pa[i]; 2300 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 2301 } 2302 2303 /* 
Send to other procs the buf size they should allocate */ 2304 /* Receive messages*/ 2305 ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr); 2306 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2307 { 2308 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2309 PetscInt *sbuf2_i; 2310 2311 ierr = MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2312 for (i=0; i<nrqr; ++i) { 2313 req_size[i] = 0; 2314 rbuf1_i = rbuf1[i]; 2315 start = 2*rbuf1_i[0] + 1; 2316 end = olengths1[i]; 2317 ierr = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr); 2318 sbuf2_i = sbuf2[i]; 2319 for (j=start; j<end; j++) { 2320 id = rbuf1_i[j] - rstart; 2321 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2322 sbuf2_i[j] = ncols; 2323 req_size[i] += ncols; 2324 } 2325 req_source1[i] = onodes1[i]; 2326 /* form the header */ 2327 sbuf2_i[0] = req_size[i]; 2328 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2329 2330 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 2331 } 2332 } 2333 2334 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2335 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2336 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2337 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2338 2339 /* Receive messages*/ 2340 ierr = PetscMalloc1(nrqs,&r_waits3);CHKERRQ(ierr); 2341 ierr = MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2342 for (i=0; i<nrqs; ++i) { 2343 ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr); 2344 req_source2[i] = pa[i]; 2345 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 2346 } 2347 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2348 2349 /* Wait on sends1 and sends2 */ 2350 ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2351 ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2352 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2353 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2354 2355 
/* Now allocate sending buffers for a->j, and send them off */ 2356 ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr); 2357 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2358 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);} 2359 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2360 2361 ierr = PetscMalloc1(nrqr,&s_waits3);CHKERRQ(ierr); 2362 { 2363 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2364 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2365 PetscInt cend = C->cmap->rend; 2366 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2367 2368 for (i=0; i<nrqr; i++) { 2369 rbuf1_i = rbuf1[i]; 2370 sbuf_aj_i = sbuf_aj[i]; 2371 ct1 = 2*rbuf1_i[0] + 1; 2372 ct2 = 0; 2373 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2374 kmax = rbuf1[i][2*j]; 2375 for (k=0; k<kmax; k++,ct1++) { 2376 row = rbuf1_i[ct1] - rstart; 2377 nzA = a_i[row+1] - a_i[row]; 2378 nzB = b_i[row+1] - b_i[row]; 2379 ncols = nzA + nzB; 2380 cworkA = a_j + a_i[row]; 2381 cworkB = b_j + b_i[row]; 2382 2383 /* load the column indices for this row into cols */ 2384 cols = sbuf_aj_i + ct2; 2385 2386 lwrite = 0; 2387 for (l=0; l<nzB; l++) { 2388 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2389 } 2390 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2391 for (l=0; l<nzB; l++) { 2392 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2393 } 2394 2395 ct2 += ncols; 2396 } 2397 } 2398 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 2399 } 2400 } 2401 2402 /* create col map: global col of C -> local col of submatrices */ 2403 { 2404 const PetscInt *icol_i; 2405 #if defined(PETSC_USE_CTABLE) 2406 for (i=0; i<ismax; i++) { 2407 if (!allcolumns[i]) { 2408 ierr = PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2409 2410 jmax = ncol[i]; 2411 icol_i = icol[i]; 2412 cmap_i = cmap[i]; 2413 for (j=0; j<jmax; j++) { 2414 ierr = 
PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2415 } 2416 } else cmap[i] = NULL; 2417 } 2418 #else 2419 for (i=0; i<ismax; i++) { 2420 if (!allcolumns[i]) { 2421 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2422 jmax = ncol[i]; 2423 icol_i = icol[i]; 2424 cmap_i = cmap[i]; 2425 for (j=0; j<jmax; j++) { 2426 cmap_i[icol_i[j]] = j+1; 2427 } 2428 } else cmap[i] = NULL; 2429 } 2430 #endif 2431 } 2432 2433 /* Create lens which is required for MatCreate... */ 2434 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2435 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2436 2437 if (ismax) { 2438 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2439 } 2440 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2441 2442 /* Update lens from local data */ 2443 for (i=0; i<ismax; i++) { 2444 row2proc_i = row2proc[i]; 2445 jmax = nrow[i]; 2446 if (!allcolumns[i]) cmap_i = cmap[i]; 2447 irow_i = irow[i]; 2448 lens_i = lens[i]; 2449 for (j=0; j<jmax; j++) { 2450 row = irow_i[j]; 2451 proc = row2proc_i[j]; 2452 if (proc == rank) { 2453 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2454 if (!allcolumns[i]) { 2455 for (k=0; k<ncols; k++) { 2456 #if defined(PETSC_USE_CTABLE) 2457 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2458 #else 2459 tcol = cmap_i[cols[k]]; 2460 #endif 2461 if (tcol) lens_i[j]++; 2462 } 2463 } else { /* allcolumns */ 2464 lens_i[j] = ncols; 2465 } 2466 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2467 } 2468 } 2469 } 2470 2471 /* Create row map: global row of C -> local row of submatrices */ 2472 #if defined(PETSC_USE_CTABLE) 2473 for (i=0; i<ismax; i++) { 2474 ierr = PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2475 irow_i = irow[i]; 2476 jmax = nrow[i]; 2477 for (j=0; j<jmax; j++) { 2478 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2479 } 2480 } 2481 #else 2482 for (i=0; i<ismax; i++) { 2483 ierr = 
PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2484 rmap_i = rmap[i]; 2485 irow_i = irow[i]; 2486 jmax = nrow[i]; 2487 for (j=0; j<jmax; j++) { 2488 rmap_i[irow_i[j]] = j; 2489 } 2490 } 2491 #endif 2492 2493 /* Update lens from offproc data */ 2494 { 2495 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2496 2497 ierr = MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2498 for (tmp2=0; tmp2<nrqs; tmp2++) { 2499 sbuf1_i = sbuf1[pa[tmp2]]; 2500 jmax = sbuf1_i[0]; 2501 ct1 = 2*jmax+1; 2502 ct2 = 0; 2503 rbuf2_i = rbuf2[tmp2]; 2504 rbuf3_i = rbuf3[tmp2]; 2505 for (j=1; j<=jmax; j++) { 2506 is_no = sbuf1_i[2*j-1]; 2507 max1 = sbuf1_i[2*j]; 2508 lens_i = lens[is_no]; 2509 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2510 rmap_i = rmap[is_no]; 2511 for (k=0; k<max1; k++,ct1++) { 2512 #if defined(PETSC_USE_CTABLE) 2513 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2514 row--; 2515 PetscCheckFalse(row < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2516 #else 2517 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2518 #endif 2519 max2 = rbuf2_i[ct1]; 2520 for (l=0; l<max2; l++,ct2++) { 2521 if (!allcolumns[is_no]) { 2522 #if defined(PETSC_USE_CTABLE) 2523 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2524 #else 2525 tcol = cmap_i[rbuf3_i[ct2]]; 2526 #endif 2527 if (tcol) lens_i[row]++; 2528 } else { /* allcolumns */ 2529 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2530 } 2531 } 2532 } 2533 } 2534 } 2535 } 2536 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2537 ierr = MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2538 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2539 2540 /* Create the submatrices */ 2541 for (i=0; i<ismax; i++) { 2542 PetscInt rbs,cbs; 2543 2544 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2545 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2546 2547 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2548 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2549 2550 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2551 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2552 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2553 2554 /* create struct Mat_SubSppt and attached it to submat */ 2555 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2556 subc = (Mat_SeqAIJ*)submats[i]->data; 2557 subc->submatis1 = smat_i; 2558 2559 smat_i->destroy = submats[i]->ops->destroy; 2560 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2561 submats[i]->factortype = C->factortype; 2562 2563 smat_i->id = i; 2564 smat_i->nrqs = nrqs; 2565 smat_i->nrqr = nrqr; 2566 smat_i->rbuf1 = rbuf1; 2567 smat_i->rbuf2 = rbuf2; 2568 smat_i->rbuf3 = rbuf3; 2569 smat_i->sbuf2 = sbuf2; 2570 smat_i->req_source2 = req_source2; 2571 2572 smat_i->sbuf1 = sbuf1; 2573 smat_i->ptr = ptr; 2574 smat_i->tmp = tmp; 2575 smat_i->ctr = ctr; 2576 2577 smat_i->pa = pa; 2578 smat_i->req_size = req_size; 2579 smat_i->req_source1 = req_source1; 2580 2581 smat_i->allcolumns = allcolumns[i]; 2582 smat_i->singleis = PETSC_FALSE; 2583 smat_i->row2proc = row2proc[i]; 2584 smat_i->rmap = rmap[i]; 2585 smat_i->cmap = cmap[i]; 2586 } 2587 2588 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2589 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2590 ierr = 
MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2591 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2592 2593 /* create struct Mat_SubSppt and attached it to submat */ 2594 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2595 submats[0]->data = (void*)smat_i; 2596 2597 smat_i->destroy = submats[0]->ops->destroy; 2598 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2599 submats[0]->factortype = C->factortype; 2600 2601 smat_i->id = 0; 2602 smat_i->nrqs = nrqs; 2603 smat_i->nrqr = nrqr; 2604 smat_i->rbuf1 = rbuf1; 2605 smat_i->rbuf2 = rbuf2; 2606 smat_i->rbuf3 = rbuf3; 2607 smat_i->sbuf2 = sbuf2; 2608 smat_i->req_source2 = req_source2; 2609 2610 smat_i->sbuf1 = sbuf1; 2611 smat_i->ptr = ptr; 2612 smat_i->tmp = tmp; 2613 smat_i->ctr = ctr; 2614 2615 smat_i->pa = pa; 2616 smat_i->req_size = req_size; 2617 smat_i->req_source1 = req_source1; 2618 2619 smat_i->allcolumns = PETSC_FALSE; 2620 smat_i->singleis = PETSC_FALSE; 2621 smat_i->row2proc = NULL; 2622 smat_i->rmap = NULL; 2623 smat_i->cmap = NULL; 2624 } 2625 2626 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2627 ierr = PetscFree(lens);CHKERRQ(ierr); 2628 if (sbuf_aj) { 2629 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2630 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2631 } 2632 2633 } /* endof scall == MAT_INITIAL_MATRIX */ 2634 2635 /* Post recv matrix values */ 2636 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2637 ierr = PetscMalloc1(nrqs,&rbuf4);CHKERRQ(ierr); 2638 ierr = PetscMalloc1(nrqs,&r_waits4);CHKERRQ(ierr); 2639 for (i=0; i<nrqs; ++i) { 2640 ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr); 2641 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 2642 } 2643 2644 /* Allocate sending buffers for a->a, and send them off */ 2645 ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr); 2646 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2647 if (nrqr) {ierr = 
PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);} 2648 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2649 2650 ierr = PetscMalloc1(nrqr,&s_waits4);CHKERRQ(ierr); 2651 { 2652 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2653 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2654 PetscInt cend = C->cmap->rend; 2655 PetscInt *b_j = b->j; 2656 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2657 2658 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2659 ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2660 for (i=0; i<nrqr; i++) { 2661 rbuf1_i = rbuf1[i]; 2662 sbuf_aa_i = sbuf_aa[i]; 2663 ct1 = 2*rbuf1_i[0]+1; 2664 ct2 = 0; 2665 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2666 kmax = rbuf1_i[2*j]; 2667 for (k=0; k<kmax; k++,ct1++) { 2668 row = rbuf1_i[ct1] - rstart; 2669 nzA = a_i[row+1] - a_i[row]; 2670 nzB = b_i[row+1] - b_i[row]; 2671 ncols = nzA + nzB; 2672 cworkB = b_j + b_i[row]; 2673 vworkA = a_a + a_i[row]; 2674 vworkB = b_a + b_i[row]; 2675 2676 /* load the column values for this row into vals*/ 2677 vals = sbuf_aa_i+ct2; 2678 2679 lwrite = 0; 2680 for (l=0; l<nzB; l++) { 2681 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2682 } 2683 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2684 for (l=0; l<nzB; l++) { 2685 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2686 } 2687 2688 ct2 += ncols; 2689 } 2690 } 2691 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 2692 } 2693 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2694 ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2695 } 2696 2697 /* Assemble the matrices */ 2698 /* First assemble the local rows */ 2699 for (i=0; i<ismax; i++) { 2700 row2proc_i = row2proc[i]; 2701 subc = (Mat_SeqAIJ*)submats[i]->data; 2702 imat_ilen = subc->ilen; 2703 imat_j = subc->j; 2704 imat_i = 
subc->i; 2705 imat_a = subc->a; 2706 2707 if (!allcolumns[i]) cmap_i = cmap[i]; 2708 rmap_i = rmap[i]; 2709 irow_i = irow[i]; 2710 jmax = nrow[i]; 2711 for (j=0; j<jmax; j++) { 2712 row = irow_i[j]; 2713 proc = row2proc_i[j]; 2714 if (proc == rank) { 2715 old_row = row; 2716 #if defined(PETSC_USE_CTABLE) 2717 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2718 row--; 2719 #else 2720 row = rmap_i[row]; 2721 #endif 2722 ilen_row = imat_ilen[row]; 2723 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2724 mat_i = imat_i[row]; 2725 mat_a = imat_a + mat_i; 2726 mat_j = imat_j + mat_i; 2727 if (!allcolumns[i]) { 2728 for (k=0; k<ncols; k++) { 2729 #if defined(PETSC_USE_CTABLE) 2730 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2731 #else 2732 tcol = cmap_i[cols[k]]; 2733 #endif 2734 if (tcol) { 2735 *mat_j++ = tcol - 1; 2736 *mat_a++ = vals[k]; 2737 ilen_row++; 2738 } 2739 } 2740 } else { /* allcolumns */ 2741 for (k=0; k<ncols; k++) { 2742 *mat_j++ = cols[k]; /* global col index! 
*/ 2743 *mat_a++ = vals[k]; 2744 ilen_row++; 2745 } 2746 } 2747 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2748 2749 imat_ilen[row] = ilen_row; 2750 } 2751 } 2752 } 2753 2754 /* Now assemble the off proc rows */ 2755 ierr = MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2756 for (tmp2=0; tmp2<nrqs; tmp2++) { 2757 sbuf1_i = sbuf1[pa[tmp2]]; 2758 jmax = sbuf1_i[0]; 2759 ct1 = 2*jmax + 1; 2760 ct2 = 0; 2761 rbuf2_i = rbuf2[tmp2]; 2762 rbuf3_i = rbuf3[tmp2]; 2763 rbuf4_i = rbuf4[tmp2]; 2764 for (j=1; j<=jmax; j++) { 2765 is_no = sbuf1_i[2*j-1]; 2766 rmap_i = rmap[is_no]; 2767 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2768 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2769 imat_ilen = subc->ilen; 2770 imat_j = subc->j; 2771 imat_i = subc->i; 2772 imat_a = subc->a; 2773 max1 = sbuf1_i[2*j]; 2774 for (k=0; k<max1; k++,ct1++) { 2775 row = sbuf1_i[ct1]; 2776 #if defined(PETSC_USE_CTABLE) 2777 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2778 row--; 2779 #else 2780 row = rmap_i[row]; 2781 #endif 2782 ilen = imat_ilen[row]; 2783 mat_i = imat_i[row]; 2784 mat_a = imat_a + mat_i; 2785 mat_j = imat_j + mat_i; 2786 max2 = rbuf2_i[ct1]; 2787 if (!allcolumns[is_no]) { 2788 for (l=0; l<max2; l++,ct2++) { 2789 #if defined(PETSC_USE_CTABLE) 2790 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2791 #else 2792 tcol = cmap_i[rbuf3_i[ct2]]; 2793 #endif 2794 if (tcol) { 2795 *mat_j++ = tcol - 1; 2796 *mat_a++ = rbuf4_i[ct2]; 2797 ilen++; 2798 } 2799 } 2800 } else { /* allcolumns */ 2801 for (l=0; l<max2; l++,ct2++) { 2802 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2803 *mat_a++ = rbuf4_i[ct2]; 2804 ilen++; 2805 } 2806 } 2807 imat_ilen[row] = ilen; 2808 } 2809 } 2810 } 2811 2812 if (!iscsorted) { /* sort column indices of the rows */ 2813 for (i=0; i<ismax; i++) { 2814 subc = (Mat_SeqAIJ*)submats[i]->data; 2815 imat_j = subc->j; 2816 imat_i = subc->i; 2817 imat_a = subc->a; 2818 imat_ilen = 
subc->ilen; 2819 2820 if (allcolumns[i]) continue; 2821 jmax = nrow[i]; 2822 for (j=0; j<jmax; j++) { 2823 mat_i = imat_i[j]; 2824 mat_a = imat_a + mat_i; 2825 mat_j = imat_j + mat_i; 2826 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2827 } 2828 } 2829 } 2830 2831 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2832 ierr = MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2833 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2834 2835 /* Restore the indices */ 2836 for (i=0; i<ismax; i++) { 2837 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2838 if (!allcolumns[i]) { 2839 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2840 } 2841 } 2842 2843 for (i=0; i<ismax; i++) { 2844 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2845 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2846 } 2847 2848 /* Destroy allocated memory */ 2849 if (sbuf_aa) { 2850 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2851 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2852 } 2853 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2854 2855 for (i=0; i<nrqs; ++i) { 2856 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2857 } 2858 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2859 2860 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2861 PetscFunctionReturn(0); 2862 } 2863 2864 /* 2865 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2866 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2867 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2868 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2869 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2870 state, and needs to be "assembled" later by compressing B's column space. 
2871 2872 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2873 Following this call, C->A & C->B have been created, even if empty. 2874 */ 2875 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2876 { 2877 /* If making this function public, change the error returned in this function away from _PLIB. */ 2878 PetscErrorCode ierr; 2879 Mat_MPIAIJ *aij; 2880 Mat_SeqAIJ *Baij; 2881 PetscBool seqaij,Bdisassembled; 2882 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2883 PetscScalar v; 2884 const PetscInt *rowindices,*colindices; 2885 2886 PetscFunctionBegin; 2887 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2888 if (A) { 2889 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2890 PetscCheckFalse(!seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2891 if (rowemb) { 2892 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2893 PetscCheckFalse(m != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2894 } else { 2895 if (C->rmap->n != A->rmap->n) { 2896 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2897 } 2898 } 2899 if (dcolemb) { 2900 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2901 PetscCheckFalse(n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %" PetscInt_FMT " is incompatible with diag matrix col size %" PetscInt_FMT,n,A->cmap->n); 2902 } else { 2903 PetscCheckFalse(C->cmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2904 } 2905 } 2906 if (B) { 2907 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2908 PetscCheckFalse(!seqaij,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is 
of wrong type"); 2909 if (rowemb) { 2910 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2911 PetscCheckFalse(m != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %" PetscInt_FMT " is incompatible with off-diag matrix row size %" PetscInt_FMT,m,A->rmap->n); 2912 } else { 2913 if (C->rmap->n != B->rmap->n) { 2914 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2915 } 2916 } 2917 if (ocolemb) { 2918 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2919 PetscCheckFalse(n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %" PetscInt_FMT " is incompatible with off-diag matrix col size %" PetscInt_FMT,n,B->cmap->n); 2920 } else { 2921 PetscCheckFalse(C->cmap->N - C->cmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2922 } 2923 } 2924 2925 aij = (Mat_MPIAIJ*)C->data; 2926 if (!aij->A) { 2927 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2928 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2929 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2930 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2931 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2932 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2933 } 2934 if (A) { 2935 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2936 } else { 2937 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2938 } 2939 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2940 /* 2941 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2942 need to "disassemble" B -- convert it to using C's global indices. 2943 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 
2944 2945 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2946 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2947 2948 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2949 At least avoid calling MatSetValues() and the implied searches? 2950 */ 2951 2952 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2953 #if defined(PETSC_USE_CTABLE) 2954 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2955 #else 2956 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2957 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2958 if (aij->B) { 2959 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2960 } 2961 #endif 2962 ngcol = 0; 2963 if (aij->lvec) { 2964 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2965 } 2966 if (aij->garray) { 2967 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2968 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2969 } 2970 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2971 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2972 } 2973 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2974 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2975 } 2976 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2977 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2978 } 2979 } 2980 Bdisassembled = PETSC_FALSE; 2981 if (!aij->B) { 2982 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2983 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2984 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2985 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2986 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2987 Bdisassembled = PETSC_TRUE; 2988 } 2989 if (B) { 2990 Baij = (Mat_SeqAIJ*)B->data; 2991 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2992 
ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2993 for (i=0; i<B->rmap->n; i++) { 2994 nz[i] = Baij->i[i+1] - Baij->i[i]; 2995 } 2996 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2997 ierr = PetscFree(nz);CHKERRQ(ierr); 2998 } 2999 3000 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3001 shift = rend-rstart; 3002 count = 0; 3003 rowindices = NULL; 3004 colindices = NULL; 3005 if (rowemb) { 3006 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3007 } 3008 if (ocolemb) { 3009 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3010 } 3011 for (i=0; i<B->rmap->n; i++) { 3012 PetscInt row; 3013 row = i; 3014 if (rowindices) row = rowindices[i]; 3015 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3016 col = Baij->j[count]; 3017 if (colindices) col = colindices[col]; 3018 if (Bdisassembled && col>=rstart) col += shift; 3019 v = Baij->a[count]; 3020 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3021 ++count; 3022 } 3023 } 3024 /* No assembly for aij->B is necessary. */ 3025 /* FIXME: set aij->B's nonzerostate correctly. */ 3026 } else { 3027 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3028 } 3029 C->preallocated = PETSC_TRUE; 3030 C->was_assembled = PETSC_FALSE; 3031 C->assembled = PETSC_FALSE; 3032 /* 3033 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3034 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3035 */ 3036 PetscFunctionReturn(0); 3037 } 3038 3039 /* 3040 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
 */
/* Return (without increfing) the diag (A) and off-diag (B) sequential blocks of the MPIAIJ matrix C.
   The caller must NOT destroy *A or *B: ownership stays with C. */
PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  /* FIXME: make sure C is assembled */
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
  Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).

  The heavy lifting is delegated through function pointers so that this driver can be shared by
  different matrix formats:
    getsubmats_seq - extracts sequential submatrices (e.g. MatCreateSubMatrices_MPIAIJ)
    getlocalmats   - returns the local diag/off-diag pieces of an MPI matrix (not used in this body;
                     presumably kept for format-specific drivers -- TODO confirm)
    setseqmat      - embeds one sequential matrix into a target (used for the size==1 case)
    setseqmats     - embeds a diag/off-diag pair into a parallel target (used for the size>1 case)
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    size,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel index sets (comm size > 1); each row/col IS pair must share a communicator. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
    PetscCheckFalse(flag != MPI_IDENT,PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
    if (size > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
         were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
         indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      /* After sorting, duplicates are adjacent, so a single pass detects them. */
      PetscCheckFalse(indices[j] == indices[j-1],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      PetscCheckFalse(indices[j-1] == indices[j],PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %" PetscInt_FMT ": indices at %" PetscInt_FMT " and %" PetscInt_FMT " are both %" PetscInt_FMT,i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    if (size > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction; see the comment above. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        /* AA is a local alias of A[i]: destroying it releases this function's reference.
           BB (== B[ii]) is released below in bulk via MatDestroySubMatrices(). */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        /* A non-NULL permutation means the caller's IS was unsorted: values must be
           permuted from A[i] into a fresh matrix rather than aliased directly. */
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  /* A's entries were either handed to the submats or destroyed above; only the array remains. */
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MPIAIJ front end: binds the generic MPIXAIJ driver to the AIJ-specific callbacks. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}