/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the imax index sets is[]
   by ov levels; each level is one collective call to MatIncreaseOverlap_MPIAIJ_Once.
   The index sets are replaced in place.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but each level uses the memory-scalable (PetscSF based) implementation.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - add one level of overlap to the index sets is[].

   Rows owned by other ranks are collected, routed to their owners through a PetscSF,
   expanded there into column indices (MatIncreaseOverlap_MPIAIJ_Send_Scalable), and the
   replies are merged back into the local index sets
   (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).  Locally owned rows are expanded in
   place by MatIncreaseOverlap_MPIAIJ_Local_Scalable, overlapped with the communication.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); /* cast because indices points to const */
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: each row travels as a pair <is id, row> */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes, then an SF to pull back the packed replies */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  /* merge the received remote overlap into the (already locally expanded) index sets */
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge received overlap data, packed as
   triples <is id, count, columns...> in recvdata, into the index sets is[]; each IS is
   then sorted, deduplicated, and recreated on a duplicate of its original communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of max_lsize+nrecvs entries per IS: current indices followed by received ones */
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - for each locally owned row requested by a
   remote rank (pairs <is id, row> in fromrows), gather its column indices from the
   diagonal (A) and off-diagonal (B) blocks and pack reply messages of the form
   <is id, count, columns...>.  Reply sizes/offsets are returned in sbrowsizes and the
   packed data in sbrows; both are allocated here and freed by the caller.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm
                 comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count (deduplicated) columns per <rank,is> to size the reply buffer */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal again, this time packing the reply data */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* prefix-sum the per-rank sizes into send offsets (second slot of each pair) */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each index set in place by the
   column indices of its locally owned rows (both the A and B blocks); rows owned by
   other ranks are skipped here (they are handled by the send/receive path above).
   Each IS is rebuilt sorted and deduplicated on a duplicate of its communicator.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr =
ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      /* w1[j]: total indices destined for rank j; w3[j]: number of ISs that touch rank j */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one contiguous slice per destination rank */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data region starts past the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild the (now enlarged) index sets from the accumulated tables */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k; /* table stores key+1 -> position+1; undo the shift */
    }
    ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(iscomms);CHKERRQ(ierr);
  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row; /* table stores key+1 -> position+1; undo the shift */
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for each row already in the set, add the column indices of both blocks;
       the bit table guards against inserting a column twice */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row   = tdata[j] - rstart;
#else
      row   = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather then all previous rows as it is now where a single large chunck of
memory is used.
929 930 */ 931 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 932 { 933 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 934 Mat A = c->A,B = c->B; 935 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 936 PetscErrorCode ierr; 937 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 938 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 939 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 940 PetscInt *rbuf_i,kmax,rbuf_0; 941 PetscBT xtable; 942 943 PetscFunctionBegin; 944 m = C->rmap->N; 945 rstart = C->rmap->rstart; 946 cstart = C->cmap->rstart; 947 ai = a->i; 948 aj = a->j; 949 bi = b->i; 950 bj = b->j; 951 garray = c->garray; 952 953 954 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 955 rbuf_i = rbuf[i]; 956 rbuf_0 = rbuf_i[0]; 957 ct += rbuf_0; 958 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 959 } 960 961 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 962 else max1 = 1; 963 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 964 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 965 ++no_malloc; 966 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 967 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 968 969 ct3 = 0; 970 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 971 rbuf_i = rbuf[i]; 972 rbuf_0 = rbuf_i[0]; 973 ct1 = 2*rbuf_0+1; 974 ct2 = ct1; 975 ct3 += ct1; 976 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 977 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 978 oct2 = ct2; 979 kmax = rbuf_i[2*j]; 980 for (k=0; k<kmax; k++,ct1++) { 981 row = rbuf_i[ct1]; 982 if (!PetscBTLookupSet(xtable,row)) { 983 if (!(ct3 < mem_estimate)) { 984 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 985 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 986 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 987 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 988 xdata[0] = tmp; 989 mem_estimate = new_estimate; ++no_malloc; 990 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 991 } 992 xdata[i][ct2++] = row; 993 ct3++; 994 } 995 } 996 for (k=oct2,max2=ct2; k<max2; k++) { 997 row = xdata[i][k] - rstart; 998 start = ai[row]; 999 end = ai[row+1]; 1000 for (l=start; l<end; l++) { 1001 val = aj[l] + cstart; 1002 if (!PetscBTLookupSet(xtable,val)) { 1003 if (!(ct3 < mem_estimate)) { 1004 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1005 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1006 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1007 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1008 xdata[0] = tmp; 1009 mem_estimate = new_estimate; ++no_malloc; 1010 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1011 } 1012 xdata[i][ct2++] = val; 1013 ct3++; 1014 } 1015 } 1016 start = bi[row]; 1017 end = bi[row+1]; 1018 for (l=start; l<end; l++) { 1019 val = garray[bj[l]]; 1020 if (!PetscBTLookupSet(xtable,val)) { 1021 if (!(ct3 < mem_estimate)) { 1022 
new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1023 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1024 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1025 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1026 xdata[0] = tmp; 1027 mem_estimate = new_estimate; ++no_malloc; 1028 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1029 } 1030 xdata[i][ct2++] = val; 1031 ct3++; 1032 } 1033 } 1034 } 1035 /* Update the header*/ 1036 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1037 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1038 } 1039 xdata[i][0] = rbuf_0; 1040 xdata[i+1] = xdata[i] + ct2; 1041 isz1[i] = ct2; /* size of each message */ 1042 } 1043 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1044 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1045 PetscFunctionReturn(0); 1046 } 1047 /* -------------------------------------------------------------------------*/ 1048 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1049 /* 1050 Every processor gets the entire matrix 1051 */ 1052 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1053 { 1054 Mat B; 1055 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1056 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1057 PetscErrorCode ierr; 1058 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1059 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1060 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1061 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1062 1063 PetscFunctionBegin; 1064 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1065 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1066 1067 if (scall == MAT_INITIAL_MATRIX) { 1068 /* 
----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row lengths = diagonal-block (ad) + off-diagonal-block (bd) nonzeros */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* emit each row's columns in globally sorted order: B-columns left of the
       diagonal block, then A-columns (shifted to global), then remaining B-columns */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else { /* MAT_REUSE_MATRIX: structure already built, reuse it */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are interleaved in exactly the same order as the column indices above */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* only allocated above in the MAT_INITIAL_MATRIX path */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry-type options from the parallel matrix to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Extracts a single sequential submatrix per process; errors below if any
    process passes ismax != 1 (see the SETERRQ at the top of the body).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt    ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt    *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status  *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm    comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1265 PetscMPIInt *onodes1,*olengths1,idex,end; 1266 Mat_SubSppt *smatis1; 1267 PetscBool isrowsorted,iscolsorted; 1268 1269 PetscFunctionBegin; 1270 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1271 1272 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1273 size = c->size; 1274 rank = c->rank; 1275 1276 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1277 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1278 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1279 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1280 if (allcolumns) { 1281 icol = NULL; 1282 ncol = C->cmap->N; 1283 } else { 1284 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1285 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1286 } 1287 1288 if (scall == MAT_INITIAL_MATRIX) { 1289 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1290 1291 /* Get some new tags to keep the communication clean */ 1292 tag1 = ((PetscObject)C)->tag; 1293 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1294 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1295 1296 /* evaluate communication - mesg to who, length of mesg, and buffer space 1297 required. 
Based on this, buffers are allocated, and data copied into them */ 1298 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1299 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1300 1301 /* w1[proc] = num of rows owned by proc -- to be requested */ 1302 proc = 0; 1303 nrqs = 0; /* num of outgoing messages */ 1304 for (j=0; j<nrow; j++) { 1305 row = irow[j]; 1306 if (!isrowsorted) proc = 0; 1307 while (row >= C->rmap->range[proc+1]) proc++; 1308 w1[proc]++; 1309 row2proc[j] = proc; /* map row index to proc */ 1310 1311 if (proc != rank && !w2[proc]) { 1312 w2[proc] = 1; nrqs++; 1313 } 1314 } 1315 w1[rank] = 0; /* rows owned by self will not be requested */ 1316 1317 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1318 for (proc=0,j=0; proc<size; proc++) { 1319 if (w1[proc]) { pa[j++] = proc;} 1320 } 1321 1322 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1323 msz = 0; /* total mesg length (for all procs) */ 1324 for (i=0; i<nrqs; i++) { 1325 proc = pa[i]; 1326 w1[proc] += 3; 1327 msz += w1[proc]; 1328 } 1329 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1330 1331 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1332 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1333 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1334 1335 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1336 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1337 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1338 1339 /* Now post the Irecvs corresponding to these messages */ 1340 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1341 1342 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1343 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1344 1345 /* Allocate Memory for outgoing messages 
*/ 1346 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1347 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1348 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1349 1350 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1351 iptr = tmp; 1352 for (i=0; i<nrqs; i++) { 1353 proc = pa[i]; 1354 sbuf1[proc] = iptr; 1355 iptr += w1[proc]; 1356 } 1357 1358 /* Form the outgoing messages */ 1359 /* Initialize the header space */ 1360 for (i=0; i<nrqs; i++) { 1361 proc = pa[i]; 1362 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1363 ptr[proc] = sbuf1[proc] + 3; 1364 } 1365 1366 /* Parse the isrow and copy data into outbuf */ 1367 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1368 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1369 proc = row2proc[j]; 1370 if (proc != rank) { /* copy to the outgoing buf*/ 1371 *ptr[proc] = irow[j]; 1372 ctr[proc]++; ptr[proc]++; 1373 } 1374 } 1375 1376 /* Update the headers for the current IS */ 1377 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1378 if ((ctr_j = ctr[j])) { 1379 sbuf1_j = sbuf1[j]; 1380 k = ++sbuf1_j[0]; 1381 sbuf1_j[2*k] = ctr_j; 1382 sbuf1_j[2*k-1] = 0; 1383 } 1384 } 1385 1386 /* Now post the sends */ 1387 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1388 for (i=0; i<nrqs; ++i) { 1389 proc = pa[i]; 1390 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1391 } 1392 1393 /* Post Receives to capture the buffer size */ 1394 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1395 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1396 1397 rbuf2[0] = tmp + msz; 1398 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1399 1400 for (i=0; i<nrqs; ++i) { 1401 proc = pa[i]; 1402 ierr = 
MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1403 } 1404 1405 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1406 1407 /* Send to other procs the buf size they should allocate */ 1408 /* Receive messages*/ 1409 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1410 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1411 1412 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1413 for (i=0; i<nrqr; ++i) { 1414 req_size[i] = 0; 1415 rbuf1_i = rbuf1[i]; 1416 start = 2*rbuf1_i[0] + 1; 1417 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1418 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1419 sbuf2_i = sbuf2[i]; 1420 for (j=start; j<end; j++) { 1421 k = rbuf1_i[j] - rstart; 1422 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1423 sbuf2_i[j] = ncols; 1424 req_size[i] += ncols; 1425 } 1426 req_source1[i] = r_status1[i].MPI_SOURCE; 1427 1428 /* form the header */ 1429 sbuf2_i[0] = req_size[i]; 1430 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1431 1432 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1433 } 1434 1435 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1436 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1437 1438 /* rbuf2 is received, Post recv column indices a->j */ 1439 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1440 1441 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1442 for (i=0; i<nrqs; ++i) { 1443 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1444 req_source2[i] = r_status2[i].MPI_SOURCE; 1445 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1446 } 1447 1448 /* Wait on sends1 and sends2 */ 1449 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1450 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1451 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1452 ierr = 
PetscFree(s_status1);CHKERRQ(ierr); 1453 1454 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1455 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1456 1457 /* Now allocate sending buffers for a->j, and send them off */ 1458 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1459 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1460 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1461 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1462 1463 for (i=0; i<nrqr; i++) { /* for each requested message */ 1464 rbuf1_i = rbuf1[i]; 1465 sbuf_aj_i = sbuf_aj[i]; 1466 ct1 = 2*rbuf1_i[0] + 1; 1467 ct2 = 0; 1468 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1469 1470 kmax = rbuf1[i][2]; 1471 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1472 row = rbuf1_i[ct1] - rstart; 1473 nzA = ai[row+1] - ai[row]; 1474 nzB = bi[row+1] - bi[row]; 1475 ncols = nzA + nzB; 1476 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1477 1478 /* load the column indices for this row into cols*/ 1479 cols = sbuf_aj_i + ct2; 1480 1481 lwrite = 0; 1482 for (l=0; l<nzB; l++) { 1483 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1484 } 1485 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1486 for (l=0; l<nzB; l++) { 1487 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1488 } 1489 1490 ct2 += ncols; 1491 } 1492 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1493 } 1494 1495 /* create column map (cmap): global col of C -> local col of submat */ 1496 #if defined(PETSC_USE_CTABLE) 1497 if (!allcolumns) { 1498 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1499 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1500 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1501 if (icol[j] >= cstart && icol[j] <cend) { 1502 cmap_loc[icol[j] - cstart] = j+1; 1503 } else { /* use PetscTable 
for non-local col indices */ 1504 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1505 } 1506 } 1507 } else { 1508 cmap = NULL; 1509 cmap_loc = NULL; 1510 } 1511 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1512 #else 1513 if (!allcolumns) { 1514 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1515 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1516 } else { 1517 cmap = NULL; 1518 } 1519 #endif 1520 1521 /* Create lens for MatSeqAIJSetPreallocation() */ 1522 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1523 1524 /* Compute lens from local part of C */ 1525 for (j=0; j<nrow; j++) { 1526 row = irow[j]; 1527 proc = row2proc[j]; 1528 if (proc == rank) { 1529 /* diagonal part A = c->A */ 1530 ncols = ai[row-rstart+1] - ai[row-rstart]; 1531 cols = aj + ai[row-rstart]; 1532 if (!allcolumns) { 1533 for (k=0; k<ncols; k++) { 1534 #if defined(PETSC_USE_CTABLE) 1535 tcol = cmap_loc[cols[k]]; 1536 #else 1537 tcol = cmap[cols[k]+cstart]; 1538 #endif 1539 if (tcol) lens[j]++; 1540 } 1541 } else { /* allcolumns */ 1542 lens[j] = ncols; 1543 } 1544 1545 /* off-diagonal part B = c->B */ 1546 ncols = bi[row-rstart+1] - bi[row-rstart]; 1547 cols = bj + bi[row-rstart]; 1548 if (!allcolumns) { 1549 for (k=0; k<ncols; k++) { 1550 #if defined(PETSC_USE_CTABLE) 1551 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1552 #else 1553 tcol = cmap[bmap[cols[k]]]; 1554 #endif 1555 if (tcol) lens[j]++; 1556 } 1557 } else { /* allcolumns */ 1558 lens[j] += ncols; 1559 } 1560 } 1561 } 1562 1563 /* Create row map (rmap): global row of C -> local row of submat */ 1564 #if defined(PETSC_USE_CTABLE) 1565 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1566 for (j=0; j<nrow; j++) { 1567 row = irow[j]; 1568 proc = row2proc[j]; 1569 if (proc == rank) { /* a local row */ 1570 rmap_loc[row - rstart] = j; 1571 } else { 1572 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1573 } 1574 } 1575 #else 1576 ierr = 
PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1577 for (j=0; j<nrow; j++) { 1578 rmap[irow[j]] = j; 1579 } 1580 #endif 1581 1582 /* Update lens from offproc data */ 1583 /* recv a->j is done */ 1584 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1585 for (i=0; i<nrqs; i++) { 1586 proc = pa[i]; 1587 sbuf1_i = sbuf1[proc]; 1588 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1589 ct1 = 2 + 1; 1590 ct2 = 0; 1591 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1592 rbuf3_i = rbuf3[i]; /* received C->j */ 1593 1594 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1595 max1 = sbuf1_i[2]; 1596 for (k=0; k<max1; k++,ct1++) { 1597 #if defined(PETSC_USE_CTABLE) 1598 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1599 row--; 1600 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1601 #else 1602 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1603 #endif 1604 /* Now, store row index of submat in sbuf1_i[ct1] */ 1605 sbuf1_i[ct1] = row; 1606 1607 nnz = rbuf2_i[ct1]; 1608 if (!allcolumns) { 1609 for (l=0; l<nnz; l++,ct2++) { 1610 #if defined(PETSC_USE_CTABLE) 1611 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1612 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1613 } else { 1614 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1615 } 1616 #else 1617 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1618 #endif 1619 if (tcol) lens[row]++; 1620 } 1621 } else { /* allcolumns */ 1622 lens[row] += nnz; 1623 } 1624 } 1625 } 1626 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1627 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1628 1629 /* Create the submatrices */ 1630 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1631 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1632 1633 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 
1634 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1635 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1636 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1637 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1638 1639 /* create struct Mat_SubSppt and attached it to submat */ 1640 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1641 subc = (Mat_SeqAIJ*)submat->data; 1642 subc->submatis1 = smatis1; 1643 1644 smatis1->id = 0; 1645 smatis1->nrqs = nrqs; 1646 smatis1->nrqr = nrqr; 1647 smatis1->rbuf1 = rbuf1; 1648 smatis1->rbuf2 = rbuf2; 1649 smatis1->rbuf3 = rbuf3; 1650 smatis1->sbuf2 = sbuf2; 1651 smatis1->req_source2 = req_source2; 1652 1653 smatis1->sbuf1 = sbuf1; 1654 smatis1->ptr = ptr; 1655 smatis1->tmp = tmp; 1656 smatis1->ctr = ctr; 1657 1658 smatis1->pa = pa; 1659 smatis1->req_size = req_size; 1660 smatis1->req_source1 = req_source1; 1661 1662 smatis1->allcolumns = allcolumns; 1663 smatis1->singleis = PETSC_TRUE; 1664 smatis1->row2proc = row2proc; 1665 smatis1->rmap = rmap; 1666 smatis1->cmap = cmap; 1667 #if defined(PETSC_USE_CTABLE) 1668 smatis1->rmap_loc = rmap_loc; 1669 smatis1->cmap_loc = cmap_loc; 1670 #endif 1671 1672 smatis1->destroy = submat->ops->destroy; 1673 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1674 submat->factortype = C->factortype; 1675 1676 /* compute rmax */ 1677 rmax = 0; 1678 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1679 1680 } else { /* scall == MAT_REUSE_MATRIX */ 1681 submat = submats[0]; 1682 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1683 1684 subc = (Mat_SeqAIJ*)submat->data; 1685 rmax = subc->rmax; 1686 smatis1 = subc->submatis1; 1687 nrqs = smatis1->nrqs; 1688 nrqr = smatis1->nrqr; 1689 rbuf1 = smatis1->rbuf1; 1690 rbuf2 = smatis1->rbuf2; 1691 rbuf3 = smatis1->rbuf3; 1692 req_source2 = smatis1->req_source2; 1693 1694 sbuf1 = smatis1->sbuf1; 1695 sbuf2 = smatis1->sbuf2; 1696 ptr = smatis1->ptr; 1697 tmp = smatis1->tmp; 1698 ctr = smatis1->ctr; 1699 1700 pa = smatis1->pa; 1701 req_size = smatis1->req_size; 1702 req_source1 = smatis1->req_source1; 1703 1704 allcolumns = smatis1->allcolumns; 1705 row2proc = smatis1->row2proc; 1706 rmap = smatis1->rmap; 1707 cmap = smatis1->cmap; 1708 #if defined(PETSC_USE_CTABLE) 1709 rmap_loc = smatis1->rmap_loc; 1710 cmap_loc = smatis1->cmap_loc; 1711 #endif 1712 } 1713 1714 /* Post recv matrix values */ 1715 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1716 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1717 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1718 for (i=0; i<nrqs; ++i) { 1719 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1720 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1721 } 1722 1723 /* Allocate sending buffers for a->a, and send them off */ 1724 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1725 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1726 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1727 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1728 1729 for (i=0; i<nrqr; i++) { 1730 rbuf1_i = rbuf1[i]; 1731 sbuf_aa_i = sbuf_aa[i]; 1732 ct1 = 2*rbuf1_i[0]+1; 1733 ct2 = 0; 1734 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1735 1736 kmax = rbuf1_i[2]; 1737 for (k=0; k<kmax; k++,ct1++) { 1738 row = rbuf1_i[ct1] - rstart; 1739 nzA = ai[row+1] - ai[row]; 1740 nzB = bi[row+1] - 
bi[row]; 1741 ncols = nzA + nzB; 1742 cworkB = bj + bi[row]; 1743 vworkA = a_a + ai[row]; 1744 vworkB = b_a + bi[row]; 1745 1746 /* load the column values for this row into vals*/ 1747 vals = sbuf_aa_i + ct2; 1748 1749 lwrite = 0; 1750 for (l=0; l<nzB; l++) { 1751 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1752 } 1753 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1754 for (l=0; l<nzB; l++) { 1755 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1756 } 1757 1758 ct2 += ncols; 1759 } 1760 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1761 } 1762 1763 /* Assemble submat */ 1764 /* First assemble the local rows */ 1765 for (j=0; j<nrow; j++) { 1766 row = irow[j]; 1767 proc = row2proc[j]; 1768 if (proc == rank) { 1769 Crow = row - rstart; /* local row index of C */ 1770 #if defined(PETSC_USE_CTABLE) 1771 row = rmap_loc[Crow]; /* row index of submat */ 1772 #else 1773 row = rmap[row]; 1774 #endif 1775 1776 if (allcolumns) { 1777 /* diagonal part A = c->A */ 1778 ncols = ai[Crow+1] - ai[Crow]; 1779 cols = aj + ai[Crow]; 1780 vals = a->a + ai[Crow]; 1781 i = 0; 1782 for (k=0; k<ncols; k++) { 1783 subcols[i] = cols[k] + cstart; 1784 subvals[i++] = vals[k]; 1785 } 1786 1787 /* off-diagonal part B = c->B */ 1788 ncols = bi[Crow+1] - bi[Crow]; 1789 cols = bj + bi[Crow]; 1790 vals = b->a + bi[Crow]; 1791 for (k=0; k<ncols; k++) { 1792 subcols[i] = bmap[cols[k]]; 1793 subvals[i++] = vals[k]; 1794 } 1795 1796 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1797 1798 } else { /* !allcolumns */ 1799 #if defined(PETSC_USE_CTABLE) 1800 /* diagonal part A = c->A */ 1801 ncols = ai[Crow+1] - ai[Crow]; 1802 cols = aj + ai[Crow]; 1803 vals = a->a + ai[Crow]; 1804 i = 0; 1805 for (k=0; k<ncols; k++) { 1806 tcol = cmap_loc[cols[k]]; 1807 if (tcol) { 1808 subcols[i] = --tcol; 1809 subvals[i++] = vals[k]; 1810 } 1811 } 1812 1813 /* off-diagonal part B = c->B */ 
1814 ncols = bi[Crow+1] - bi[Crow]; 1815 cols = bj + bi[Crow]; 1816 vals = b->a + bi[Crow]; 1817 for (k=0; k<ncols; k++) { 1818 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1819 if (tcol) { 1820 subcols[i] = --tcol; 1821 subvals[i++] = vals[k]; 1822 } 1823 } 1824 #else 1825 /* diagonal part A = c->A */ 1826 ncols = ai[Crow+1] - ai[Crow]; 1827 cols = aj + ai[Crow]; 1828 vals = a->a + ai[Crow]; 1829 i = 0; 1830 for (k=0; k<ncols; k++) { 1831 tcol = cmap[cols[k]+cstart]; 1832 if (tcol) { 1833 subcols[i] = --tcol; 1834 subvals[i++] = vals[k]; 1835 } 1836 } 1837 1838 /* off-diagonal part B = c->B */ 1839 ncols = bi[Crow+1] - bi[Crow]; 1840 cols = bj + bi[Crow]; 1841 vals = b->a + bi[Crow]; 1842 for (k=0; k<ncols; k++) { 1843 tcol = cmap[bmap[cols[k]]]; 1844 if (tcol) { 1845 subcols[i] = --tcol; 1846 subvals[i++] = vals[k]; 1847 } 1848 } 1849 #endif 1850 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1851 } 1852 } 1853 } 1854 1855 /* Now assemble the off-proc rows */ 1856 for (i=0; i<nrqs; i++) { /* for each requested message */ 1857 /* recv values from other processes */ 1858 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1859 proc = pa[idex]; 1860 sbuf1_i = sbuf1[proc]; 1861 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1862 ct1 = 2 + 1; 1863 ct2 = 0; /* count of received C->j */ 1864 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1865 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1866 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1867 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1868 1869 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1870 max1 = sbuf1_i[2]; /* num of rows */ 1871 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1872 row = sbuf1_i[ct1]; /* row index of submat */ 
1873 if (!allcolumns) { 1874 idex = 0; 1875 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1876 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1877 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1878 #if defined(PETSC_USE_CTABLE) 1879 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1880 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1881 } else { 1882 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1883 } 1884 #else 1885 tcol = cmap[rbuf3_i[ct2]]; 1886 #endif 1887 if (tcol) { 1888 subcols[idex] = --tcol; /* may not be sorted */ 1889 subvals[idex++] = rbuf4_i[ct2]; 1890 1891 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1892 For reuse, we replace received C->j with index that should be inserted to submat */ 1893 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1894 } 1895 } 1896 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1897 } else { /* scall == MAT_REUSE_MATRIX */ 1898 submat = submats[0]; 1899 subc = (Mat_SeqAIJ*)submat->data; 1900 1901 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1902 for (l=0; l<nnz; l++) { 1903 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1904 subvals[idex++] = rbuf4_i[ct2]; 1905 } 1906 1907 bj = subc->j + subc->i[row]; /* sorted column indices */ 1908 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1909 } 1910 } else { /* allcolumns */ 1911 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1912 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1913 ct2 += nnz; 1914 } 1915 } 1916 } 1917 1918 /* sending a->a are done */ 1919 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1920 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1921 1922 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1923 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  submats[0] = submat;

  /* Restore the indices */
  ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
  if (!allcolumns) {
    ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
  }
  ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatCreateSubMatrices_MPIAIJ_SingleIS - extract submatrices when every process
   supplies exactly one index set (the PCSetUp_ASM() fast path).

   Detects the "allcolumns" special case (identity iscol covering all global
   columns) locally and forwards to the _SingleIS_Local worker, which does the
   actual communication and assembly.

   For MAT_INITIAL_MATRIX two Mat slots are allocated: submat[0] for the result
   and one extra slot (used by the worker for reuse bookkeeping).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       ncol;
  PetscBool      colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
  ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCreateSubMatrices_MPIAIJ - driver that extracts ismax sequential
   submatrices C(isrow[i],iscol[i]) from a parallel MPIAIJ matrix.

   Work is split into nstages stages of at most nmax submatrices each, where
   nmax is sized so one stage's column-map workspace stays near 20 MB
   (20*1000000/(N*sizeof(PetscInt))); processes agree on the global nstages
   (and on the "wantallmatrix" fast path) via a single MPIU_Allreduce(MPI_MAX).

   Special cases:
   - C->submat_singleis set (by PCSetUp_ASM) -> SingleIS path, no Allreduce;
   - identity isrow/iscol covering the whole square matrix -> fast
     MatCreateSubMatrix_MPIAIJ_All path (tunable via -use_fast_submatrix);
   - MAT_REUSE_MATRIX -> nstages (and singleis flag) recovered from the
     Mat_SubSppt stashed in submat[0] at initial creation; a zero-size "dummy"
     matrix carries the stash when a process has ismax == 0.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices per stage, chosen so a stage's O(N) workspace is ~20 MB */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Single reduction combines both values: MPI_MAX of -wantallmatrix keeps
       the fast path only if EVERY process wants it (any 0 beats any -1) */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  for (i=0,pos=0; i<nstages; i++) {
    /* max_no = how many of this process's remaining IS are handled this stage;
       may be 0 when other processes have more stages than we need locally */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}

/*
-------------------------------------------------------------------------*/
/*
   MatCreateSubMatrices_MPIAIJ_Local - worker for MatCreateSubMatrices_MPIAIJ:
   builds up to ismax sequential AIJ submatrices C(isrow[i],iscol[i]) on each
   process.

   Communication protocol (four non-blocking rounds, each with its own tag):
     tag0: send the global row indices we need to their owning processes
           (header per message: [nis, (is_no,count) pairs..., row indices...]);
     tag2: owners reply with per-row nonzero counts (so requesters can size
           receive buffers);
     tag3: owners send the column indices (C->j) of the requested rows;
     tag4: owners send the numerical values (C->a) of the requested rows.

   For MAT_INITIAL_MATRIX all buffers, row/column maps and request metadata
   are computed, the submatrices are created and preallocated, and everything
   needed to repeat the exchange is stashed in a Mat_SubSppt attached to each
   submatrix (a zero-size MATDUMMY carries the stash when ismax == 0).
   For MAT_REUSE_MATRIX the stashed state is recovered and only the value
   exchange (tag4) plus assembly is redone; row lengths must be unchanged.

   Column/row translation uses PetscTable when PETSC_USE_CTABLE is defined,
   otherwise dense arrays of length C->cmap->N / C->rmap->N per IS.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
  ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    /* iscsorted stays PETSC_TRUE only if every iscol[i] is sorted; otherwise
       submatrix rows are sorted explicitly at the end */
    ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
    if (!issorted[i]) iscsorted = issorted[i];

    ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);

    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);

    /* Check for special case: allcolumn */
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i]       = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      /* Recover the communication state stashed at MAT_INITIAL_MATRIX time;
         the scalar fields (nrqs, sbuf1, ...) are shared across all IS, so each
         iteration overwrites them with the same values */
      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
      if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */

    /* w1[p] = #rows requested from proc p; w3[p] = #IS with at least one such row;
       w4 is per-IS scratch, reset after each IS */
    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
      row2proc[i] = row2proc_i;

      /* for a sorted IS the owner search can continue from the previous row */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j      = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz   += w1[j];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    tag0 = ((PetscObject)C)->tag;
    ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages.
       tmp is sized 2*msz: the first msz entries back the send buffers sbuf1[],
       the second half backs rbuf2[] (the count replies) below */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

    {
      /* partition tmp among the destination procs */
      PetscInt *iptr = tmp;
      k = 0;
      for (i=0; i<nrqs; i++) {
        j         = pa[i];
        iptr     += k;
        sbuf1[j]  = iptr;
        k         = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
      ptr[j] = sbuf1[j] + 2*w3[j] + 1; /* ptr[j] walks the data region past the header */
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0]; /* header slot k holds (is_no, count) */
          sbuf1_j[2*k]   = ctr_j;
          sbuf1_j[2*k-1] = i;
        }
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
    rbuf2[0] = tmp + msz; /* reuse the second half of tmp as the rbuf2 storage */
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i     = rbuf1[i];
        start       = 2*rbuf1_i[0] + 1; /* skip header: 1 + 2*(#IS) entries */
        ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
        ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
        sbuf2_i     = sbuf2[i];
        /* per requested row: number of nonzeros = diag (A) + offdiag (B) counts */
        for (j=start; j<end; j++) {
          id           = rbuf1_i[j] - rstart;
          ncols        = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
          sbuf2_i[j]   = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = r_status1[i].MPI_SOURCE;
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);
    ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

    /* Receive messages*/
    ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* rbuf2[i][0] = total nonzeros coming from this source; size rbuf3 accordingly */
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }
    ierr = PetscFree(r_status2);CHKERRQ(ierr);
    ierr = PetscFree(r_waits2);CHKERRQ(ierr);

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

    if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
    ierr = PetscFree(s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_status2);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols; B-part entries
               before cstart, then the A (diagonal) part, then B-part entries
               at/after cend -- this yields globally sorted column indices */
            cols = sbuf_aj_i + ct2;

            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);

    /* create col map: global col of C -> local col of submatrices */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            /* keys/values stored 1-based so that 0 can mean "not present" */
            ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            cmap_i[icol_i[j]] = j+1; /* 1-based; 0 means column not selected */
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);

    if (ismax) {
      ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    for (i=0; i<ismax; i++) {
      ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
            row--;
            if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    ierr = PetscFree(r_waits3);CHKERRQ(ierr);
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
    ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree(s_waits3);CHKERRQ(ierr);

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNew(&smat_i);CHKERRQ(ierr);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      /* intercept destroy so the shared communication state is freed with the submatrix */
      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1       = sbuf1;
      smat_i->ptr         = ptr;
      smat_i->tmp         = tmp;
      smat_i->ctr         = ctr;

      smat_i->pa           = pa;
      smat_i->req_size     = req_size;
      smat_i->req_source1  = req_source1;

      smat_i->allcolumns  = allcolumns[i];
      smat_i->singleis    = PETSC_FALSE;
      smat_i->row2proc    = row2proc[i];
      smat_i->rmap        = rmap[i];
      smat_i->cmap        = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1       = sbuf1;
      smat_i->ptr         = ptr;
      smat_i->tmp         = tmp;
      smat_i->ctr         = ctr;

      smat_i->pa           = pa;
      smat_i->req_size     = req_size;
      smat_i->req_source1  = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = NULL;
      smat_i->rmap       = NULL;
      smat_i->cmap       = NULL;
    }

    if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in the SAME
             B-before/A/B-after order used for the column indices above so
             values line up with rbuf3 on the receiving side */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc      = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen = subc->ilen;
    imat_j    = subc->j;
    imat_i    = subc->i;
    imat_a    = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
        /* write directly into the preallocated CSR arrays of the submatrix */
        mat_i    = imat_i[row];
        mat_a    = imat_a + mat_i;
        mat_j    = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k]; /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no     = sbuf1_i[2*j-1];
      rmap_i    = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);

  ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
 After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
 state, and needs to be "assembled" later by compressing B's column space.
2885 2886 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2887 Following this call, C->A & C->B have been created, even if empty. 2888 */ 2889 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2890 { 2891 /* If making this function public, change the error returned in this function away from _PLIB. */ 2892 PetscErrorCode ierr; 2893 Mat_MPIAIJ *aij; 2894 Mat_SeqAIJ *Baij; 2895 PetscBool seqaij,Bdisassembled; 2896 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2897 PetscScalar v; 2898 const PetscInt *rowindices,*colindices; 2899 2900 PetscFunctionBegin; 2901 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2902 if (A) { 2903 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2904 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2905 if (rowemb) { 2906 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2907 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2908 } else { 2909 if (C->rmap->n != A->rmap->n) { 2910 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2911 } 2912 } 2913 if (dcolemb) { 2914 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2915 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2916 } else { 2917 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2918 } 2919 } 2920 if (B) { 2921 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2922 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2923 if (rowemb) { 2924 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2925 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2926 } else { 2927 if (C->rmap->n != B->rmap->n) { 2928 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2929 } 2930 } 2931 if (ocolemb) { 2932 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2933 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2934 } else { 2935 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2936 } 2937 } 2938 2939 aij = (Mat_MPIAIJ*)C->data; 2940 if (!aij->A) { 2941 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2942 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2943 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2944 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2945 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2946 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2947 } 2948 if (A) { 2949 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2950 } else { 2951 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2952 } 2953 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2954 /* 2955 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2956 need to "disassemble" B -- convert it to using C's global indices. 2957 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2958 2959 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2960 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2961 2962 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2963 At least avoid calling MatSetValues() and the implied searches? 2964 */ 2965 2966 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2967 #if defined(PETSC_USE_CTABLE) 2968 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2969 #else 2970 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2971 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2972 if (aij->B) { 2973 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2974 } 2975 #endif 2976 ngcol = 0; 2977 if (aij->lvec) { 2978 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2979 } 2980 if (aij->garray) { 2981 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2982 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2983 } 2984 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2985 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2986 } 2987 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2988 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2989 } 2990 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2991 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2992 } 2993 } 2994 Bdisassembled = PETSC_FALSE; 2995 if (!aij->B) { 2996 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2997 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2998 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2999 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3000 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3001 Bdisassembled = PETSC_TRUE; 3002 } 3003 if (B) { 3004 Baij = (Mat_SeqAIJ*)B->data; 3005 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3006 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3007 for (i=0; i<B->rmap->n; i++) { 3008 nz[i] = Baij->i[i+1] - Baij->i[i]; 3009 } 3010 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3011 ierr = PetscFree(nz);CHKERRQ(ierr); 3012 } 3013 3014 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3015 shift = rend-rstart; 3016 count = 0; 3017 rowindices = NULL; 3018 colindices = NULL; 3019 if (rowemb) { 3020 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3021 } 3022 if (ocolemb) { 3023 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3024 } 3025 for (i=0; i<B->rmap->n; i++) { 3026 PetscInt row; 3027 row = i; 3028 if (rowindices) row = rowindices[i]; 3029 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3030 col = Baij->j[count]; 3031 if (colindices) col = colindices[col]; 3032 if (Bdisassembled && col>=rstart) col += shift; 3033 v = Baij->a[count]; 3034 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3035 ++count; 3036 } 3037 } 3038 /* No assembly for aij->B is necessary. */ 3039 /* FIXME: set aij->B's nonzerostate correctly. */ 3040 } else { 3041 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3042 } 3043 C->preallocated = PETSC_TRUE; 3044 C->was_assembled = PETSC_FALSE; 3045 C->assembled = PETSC_FALSE; 3046 /* 3047 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3048 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3049 */ 3050 PetscFunctionReturn(0); 3051 } 3052 3053 /* 3054 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3055 */ 3056 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3057 { 3058 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3059 3060 PetscFunctionBegin; 3061 PetscValidPointer(A,2); 3062 PetscValidPointer(B,3); 3063 /* FIXME: make sure C is assembled */ 3064 *A = aij->A; 3065 *B = aij->B; 3066 /* Note that we don't incref *A and *B, so be careful! */ 3067 PetscFunctionReturn(0); 3068 } 3069 3070 /* 3071 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
*/
/*
  Generic driver: for each pair (isrow[i],iscol[i]) -- which may live on a subcomm of C --
  extract the corresponding submatrix of C.  The sequential extraction, local-matrix access
  and seq-into-parallel insertion are supplied as callbacks so different (X)AIJ flavors can
  share this logic.
  NOTE(review): getlocalmats is accepted for symmetry with the other callbacks but is not
  called anywhere in this routine -- confirm before relying on it.
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;   /* diag (A) and off-diag (B) sequential pieces returned by getsubmats_seq */
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel index sets (comm size > 1); only those need an off-diag extraction. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize; /* NOTE(review): intentionally(?) shadows the outer isize */
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
      NOTE: ISSort() mutates the caller's isrow[i]/iscol[i] in place; isrow_p/iscol_p record
      the permutation needed to put values back in the caller's original order.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i]; /* alias only: no reference is taken (see comment above) */
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = A[i];
        BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }

        /* A[i] is no longer needed; B[ii] is freed collectively by MatDestroySubMatrices() below. */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        /* Indices were permuted during sorting: permute A[i]'s values back via setseqmat. */
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller's submat array */
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MPIAIJ instantiation of the generic driver above: plugs in the AIJ-specific callbacks. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}