1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 22 { 23 PetscErrorCode ierr; 24 PetscInt i; 25 26 PetscFunctionBegin; 27 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 28 for (i=0; i<ov; ++i) { 29 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 30 } 31 PetscFunctionReturn(0); 32 } 33 34 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 35 { 36 PetscErrorCode ierr; 37 PetscInt i; 38 39 PetscFunctionBegin; 40 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 41 for (i=0; i<ov; ++i) { 42 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 43 } 44 
PetscFunctionReturn(0); 45 } 46 47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 48 { 49 PetscErrorCode ierr; 50 MPI_Comm comm; 51 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j; 52 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 53 PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL; 54 const PetscInt *indices_i,**indices; 55 PetscLayout rmap; 56 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner; 57 PetscSF sf; 58 PetscSFNode *remote; 59 60 PetscFunctionBegin; 61 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 62 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 63 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 64 /* get row map to determine where rows should be going */ 65 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 66 /* retrieve IS data and put all together so that we 67 * can optimize communication 68 * */ 69 ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 70 for (i=0,tlength=0; i<nidx; i++) { 71 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 72 tlength += length[i]; 73 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 74 } 75 /* find these rows on remote processors */ 76 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 77 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 78 nrrows = 0; 79 for (i=0; i<nidx; i++) { 80 length_i = length[i]; 81 indices_i = indices[i]; 82 for (j=0; j<length_i; j++) { 83 owner = -1; 84 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 85 /* remote processors */ 86 if (owner != rank) { 87 tosizes_temp[owner]++; /* number of rows to owner */ 88 rrow_ranks[nrrows] = owner; /* processor */ 89 rrow_isids[nrrows] = i; /* is id */ 90 remoterows[nrrows++] = indices_i[j]; /* row */ 91 } 92 } 93 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 
94 } 95 ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr); 96 /* test if we need to exchange messages 97 * generally speaking, we do not need to exchange 98 * data when overlap is 1 99 * */ 100 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr); 101 /* we do not have any messages 102 * It usually corresponds to overlap 1 103 * */ 104 if (!reducednrrows) { 105 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 106 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 107 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 nto = 0; 111 /* send sizes and ranks for building a two-sided communcation */ 112 for (i=0; i<size; i++) { 113 if (tosizes_temp[i]) { 114 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 115 tosizes_temp[i] = nto; /* a map from processor to index */ 116 toranks[nto++] = i; /* processor */ 117 } 118 } 119 ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr); 120 toffsets[0] = 0; 121 for (i=0; i<nto; i++) { 122 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 123 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 124 } 125 /* send information to other processors */ 126 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 127 nrecvrows = 0; 128 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 129 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 130 nrecvrows = 0; 131 for (i=0; i<nfrom; i++) { 132 for (j=0; j<fromsizes[2*i]; j++) { 133 remote[nrecvrows].rank = fromranks[i]; 134 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 135 } 136 } 137 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 138 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 139 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 140 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 
141 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 142 /* message pair <no of is, row> */ 143 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 144 for (i=0; i<nrrows; i++) { 145 owner = rrow_ranks[i]; /* processor */ 146 j = tosizes_temp[owner]; /* index */ 147 todata[toffsets[j]++] = rrow_isids[i]; 148 todata[toffsets[j]++] = remoterows[i]; 149 } 150 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 151 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 152 ierr = PetscFree(toffsets);CHKERRQ(ierr); 153 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr); 154 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr); 155 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 156 /* send rows belonging to the remote so that then we could get the overlapping data back */ 157 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 158 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 159 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 160 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 161 ierr = PetscFree(fromranks);CHKERRQ(ierr); 162 nrecvrows = 0; 163 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 164 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 165 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 166 nrecvrows = 0; 167 for (i=0; i<nto; i++) { 168 for (j=0; j<tosizes[2*i]; j++) { 169 remote[nrecvrows].rank = toranks[i]; 170 remote[nrecvrows++].index = tosizes[2*i+1]+j; 171 } 172 } 173 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 174 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 175 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 176 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 177 ierr = 
PetscSFSetFromOptions(sf);CHKERRQ(ierr); 178 /* overlap communication and computation */ 179 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr); 180 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 181 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr); 182 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 183 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 184 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 185 ierr = PetscFree(toranks);CHKERRQ(ierr); 186 ierr = PetscFree(tosizes);CHKERRQ(ierr); 187 ierr = PetscFree(todata);CHKERRQ(ierr); 188 PetscFunctionReturn(0); 189 } 190 191 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 192 { 193 PetscInt *isz,isz_i,i,j,is_id, data_size; 194 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 195 const PetscInt *indices_i_temp; 196 MPI_Comm *iscomms; 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 max_lsize = 0; 201 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 202 for (i=0; i<nidx; i++) { 203 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 204 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 205 isz[i] = lsize; 206 } 207 ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr); 208 for (i=0; i<nidx; i++) { 209 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 210 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 211 ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr); 212 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 213 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 214 } 215 /* retrieve information to get row id and its overlap */ 216 for (i=0; i<nrecvs;) { 217 is_id = recvdata[i++]; 218 data_size = recvdata[i++]; 219 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 220 isz_i = isz[is_id]; 221 for (j=0; j< data_size; j++) { 222 col = recvdata[i++]; 223 indices_i[isz_i++] = col; 224 } 225 isz[is_id] = isz_i; 226 } 227 /* remove duplicate entities */ 228 for (i=0; i<nidx; i++) { 229 indices_i = indices_temp+(max_lsize+nrecvs)*i; 230 isz_i = isz[i]; 231 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 232 ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 233 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 234 } 235 ierr = PetscFree(isz);CHKERRQ(ierr); 236 ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr); 237 PetscFunctionReturn(0); 238 } 239 240 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 241 { 242 PetscLayout rmap,cmap; 243 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 244 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 245 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 246 const PetscInt *gcols,*ai,*aj,*bi,*bj; 247 Mat amat,bmat; 248 PetscMPIInt rank; 249 PetscBool done; 250 MPI_Comm comm; 251 
PetscErrorCode ierr; 252 253 PetscFunctionBegin; 254 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 255 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 256 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 257 /* Even if the mat is symmetric, we still assume it is not symmetric */ 258 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 259 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 260 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 261 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 262 /* total number of nonzero values is used to estimate the memory usage in the next step */ 263 tnz = ai[an]+bi[bn]; 264 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 265 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 266 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 267 /* to find the longest message */ 268 max_fszs = 0; 269 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 270 /* better way to estimate number of nonzero in the mat??? 
*/ 271 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 272 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 273 rows_pos = 0; 274 totalrows = 0; 275 for (i=0; i<nfrom; i++) { 276 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 277 /* group data together */ 278 for (j=0; j<fromsizes[2*i]; j+=2) { 279 is_id = fromrows[rows_pos++];/* no of is */ 280 rows_i = rows_data[is_id]; 281 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 282 } 283 /* estimate a space to avoid multiple allocations */ 284 for (j=0; j<nidx; j++) { 285 indvc_ij = 0; 286 rows_i = rows_data[j]; 287 for (l=0; l<rows_pos_i[j]; l++) { 288 row = rows_i[l]-rstart; 289 start = ai[row]; 290 end = ai[row+1]; 291 for (k=start; k<end; k++) { /* Amat */ 292 col = aj[k] + cstart; 293 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 294 } 295 start = bi[row]; 296 end = bi[row+1]; 297 for (k=start; k<end; k++) { /* Bmat */ 298 col = gcols[bj[k]]; 299 indices_tmp[indvc_ij++] = col; 300 } 301 } 302 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 303 indv_counts[i*nidx+j] = indvc_ij; 304 totalrows += indvc_ij; 305 } 306 } 307 /* message triple <no of is, number of rows, rows> */ 308 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 309 totalrows = 0; 310 rows_pos = 0; 311 /* use this code again */ 312 for (i=0;i<nfrom;i++) { 313 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 314 for (j=0; j<fromsizes[2*i]; j+=2) { 315 is_id = fromrows[rows_pos++]; 316 rows_i = rows_data[is_id]; 317 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 318 } 319 /* add data */ 320 for (j=0; j<nidx; j++) { 321 if (!indv_counts[i*nidx+j]) continue; 322 indvc_ij = 0; 323 sbdata[totalrows++] = j; 324 sbdata[totalrows++] = indv_counts[i*nidx+j]; 325 sbsizes[2*i] += 2; 326 rows_i = rows_data[j]; 327 for (l=0; l<rows_pos_i[j]; 
l++) { 328 row = rows_i[l]-rstart; 329 start = ai[row]; 330 end = ai[row+1]; 331 for (k=start; k<end; k++) { /* Amat */ 332 col = aj[k] + cstart; 333 indices_tmp[indvc_ij++] = col; 334 } 335 start = bi[row]; 336 end = bi[row+1]; 337 for (k=start; k<end; k++) { /* Bmat */ 338 col = gcols[bj[k]]; 339 indices_tmp[indvc_ij++] = col; 340 } 341 } 342 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 343 sbsizes[2*i] += indvc_ij; 344 ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr); 345 totalrows += indvc_ij; 346 } 347 } 348 ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 349 offsets[0] = 0; 350 for (i=0; i<nfrom; i++) { 351 offsets[i+1] = offsets[i] + sbsizes[2*i]; 352 sbsizes[2*i+1] = offsets[i]; 353 } 354 ierr = PetscFree(offsets);CHKERRQ(ierr); 355 if (sbrowsizes) *sbrowsizes = sbsizes; 356 if (sbrows) *sbrows = sbdata; 357 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 358 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 359 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 360 PetscFunctionReturn(0); 361 } 362 363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 364 { 365 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 366 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 367 PetscInt lsize,lsize_tmp; 368 PetscMPIInt rank,owner; 369 Mat amat,bmat; 370 PetscBool done; 371 PetscLayout cmap,rmap; 372 MPI_Comm comm; 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 377 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 378 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 379 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 380 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 381 
ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 382 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 383 /* is it a safe way to compute number of nonzero values ? */ 384 tnz = ai[an]+bi[bn]; 385 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 386 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 387 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 388 /* it is a better way to estimate memory than the old implementation 389 * where global size of matrix is used 390 * */ 391 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 392 for (i=0; i<nidx; i++) { 393 MPI_Comm iscomm; 394 395 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 396 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 397 lsize_tmp = 0; 398 for (j=0; j<lsize; j++) { 399 owner = -1; 400 row = indices[j]; 401 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 402 if (owner != rank) continue; 403 /* local number */ 404 row -= rstart; 405 start = ai[row]; 406 end = ai[row+1]; 407 for (k=start; k<end; k++) { /* Amat */ 408 col = aj[k] + cstart; 409 indices_temp[lsize_tmp++] = col; 410 } 411 start = bi[row]; 412 end = bi[row+1]; 413 for (k=start; k<end; k++) { /* Bmat */ 414 col = gcols[bj[k]]; 415 indices_temp[lsize_tmp++] = col; 416 } 417 } 418 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 419 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr); 420 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 421 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 422 ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 423 ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr); 424 } 425 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 426 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 427 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 
428 PetscFunctionReturn(0); 429 } 430 431 /* 432 Sample message format: 433 If a processor A wants processor B to process some elements corresponding 434 to index sets is[1],is[5] 435 mesg [0] = 2 (no of index sets in the mesg) 436 ----------- 437 mesg [1] = 1 => is[1] 438 mesg [2] = sizeof(is[1]); 439 ----------- 440 mesg [3] = 5 => is[5] 441 mesg [4] = sizeof(is[5]); 442 ----------- 443 mesg [5] 444 mesg [n] datas[1] 445 ----------- 446 mesg[n+1] 447 mesg[m] data(is[5]) 448 ----------- 449 450 Notes: 451 nrqs - no of requests sent (or to be sent out) 452 nrqr - no of requests received (which have to be or which have been processed) 453 */ 454 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 455 { 456 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 457 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 458 const PetscInt **idx,*idx_i; 459 PetscInt *n,**data,len; 460 #if defined(PETSC_USE_CTABLE) 461 PetscTable *table_data,table_data_i; 462 PetscInt *tdata,tcount,tcount_max; 463 #else 464 PetscInt *data_i,*d_p; 465 #endif 466 PetscErrorCode ierr; 467 PetscMPIInt size,rank,tag1,tag2,proc = 0; 468 PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 469 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 470 PetscBT *table; 471 MPI_Comm comm; 472 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 473 MPI_Status *s_status,*recv_status; 474 MPI_Comm *iscomms; 475 char *t_p; 476 477 PetscFunctionBegin; 478 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 479 size = c->size; 480 rank = c->rank; 481 M = C->rmap->N; 482 483 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 484 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 485 486 ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 487 488 for (i=0; i<imax; i++) { 489 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 490 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 491 } 492 493 /* evaluate communication - mesg 
to who,length of mesg, and buffer space 494 required. Based on this, buffers are allocated, and data copied into them */ 495 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 496 for (i=0; i<imax; i++) { 497 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/ 498 idx_i = idx[i]; 499 len = n[i]; 500 for (j=0; j<len; j++) { 501 row = idx_i[j]; 502 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 503 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 504 w4[proc]++; 505 } 506 for (j=0; j<size; j++) { 507 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 508 } 509 } 510 511 nrqs = 0; /* no of outgoing messages */ 512 msz = 0; /* total mesg length (for all proc */ 513 w1[rank] = 0; /* no mesg sent to intself */ 514 w3[rank] = 0; 515 for (i=0; i<size; i++) { 516 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 517 } 518 /* pa - is list of processors to communicate with */ 519 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 520 for (i=0,j=0; i<size; i++) { 521 if (w1[i]) {pa[j] = i; j++;} 522 } 523 524 /* Each message would have a header = 1 + 2*(no of IS) + data */ 525 for (i=0; i<nrqs; i++) { 526 j = pa[i]; 527 w1[j] += w2[j] + 2*w3[j]; 528 msz += w1[j]; 529 } 530 531 /* Determine the number of messages to expect, their lengths, from from-ids */ 532 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 533 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 534 535 /* Now post the Irecvs corresponding to these messages */ 536 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 537 538 /* Allocate Memory for outgoing messages */ 539 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 540 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr); 541 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 542 543 { 544 PetscInt *iptr = tmp,ict = 0; 545 for 
(i=0; i<nrqs; i++) { 546 j = pa[i]; 547 iptr += ict; 548 outdat[j] = iptr; 549 ict = w1[j]; 550 } 551 } 552 553 /* Form the outgoing messages */ 554 /* plug in the headers */ 555 for (i=0; i<nrqs; i++) { 556 j = pa[i]; 557 outdat[j][0] = 0; 558 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr); 559 ptr[j] = outdat[j] + 2*w3[j] + 1; 560 } 561 562 /* Memory for doing local proc's work */ 563 { 564 PetscInt M_BPB_imax = 0; 565 #if defined(PETSC_USE_CTABLE) 566 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 567 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 568 for (i=0; i<imax; i++) { 569 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 570 } 571 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 572 for (i=0; i<imax; i++) { 573 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 574 } 575 #else 576 PetscInt Mimax = 0; 577 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 578 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 579 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 580 for (i=0; i<imax; i++) { 581 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 582 data[i] = d_p + M*i; 583 } 584 #endif 585 } 586 587 /* Parse the IS and update local tables and the outgoing buf with the data */ 588 { 589 PetscInt n_i,isz_i,*outdat_j,ctr_j; 590 PetscBT table_i; 591 592 for (i=0; i<imax; i++) { 593 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 594 n_i = n[i]; 595 table_i = table[i]; 596 idx_i = idx[i]; 597 #if defined(PETSC_USE_CTABLE) 598 table_data_i = table_data[i]; 599 #else 600 data_i = data[i]; 601 #endif 602 isz_i = isz[i]; 603 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 604 row = idx_i[j]; 605 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 606 if (proc != rank) { /* copy to the outgoing buffer */ 607 ctr[proc]++; 608 *ptr[proc] = row; 609 ptr[proc]++; 
610 } else if (!PetscBTLookupSet(table_i,row)) { 611 #if defined(PETSC_USE_CTABLE) 612 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 613 #else 614 data_i[isz_i] = row; /* Update the local table */ 615 #endif 616 isz_i++; 617 } 618 } 619 /* Update the headers for the current IS */ 620 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 621 if ((ctr_j = ctr[j])) { 622 outdat_j = outdat[j]; 623 k = ++outdat_j[0]; 624 outdat_j[2*k] = ctr_j; 625 outdat_j[2*k-1] = i; 626 } 627 } 628 isz[i] = isz_i; 629 } 630 } 631 632 /* Now post the sends */ 633 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 634 for (i=0; i<nrqs; ++i) { 635 j = pa[i]; 636 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 637 } 638 639 /* No longer need the original indices */ 640 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 641 for (i=0; i<imax; ++i) { 642 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 643 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 644 } 645 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 646 647 for (i=0; i<imax; ++i) { 648 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 649 } 650 651 /* Do Local work */ 652 #if defined(PETSC_USE_CTABLE) 653 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 654 #else 655 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 656 #endif 657 658 /* Receive messages */ 659 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 660 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);} 661 662 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 663 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);} 664 665 /* Phase 1 sends are complete - deallocate buffers */ 666 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 667 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 668 669 ierr = 
PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 670 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 671 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 672 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 673 ierr = PetscFree(rbuf);CHKERRQ(ierr); 674 675 /* Send the data back */ 676 /* Do a global reduction to know the buffer space req for incoming messages */ 677 { 678 PetscMPIInt *rw1; 679 680 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 681 682 for (i=0; i<nrqr; ++i) { 683 proc = recv_status[i].MPI_SOURCE; 684 685 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 686 rw1[proc] = isz1[i]; 687 } 688 ierr = PetscFree(onodes1);CHKERRQ(ierr); 689 ierr = PetscFree(olengths1);CHKERRQ(ierr); 690 691 /* Determine the number of messages to expect, their lengths, from from-ids */ 692 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 693 ierr = PetscFree(rw1);CHKERRQ(ierr); 694 } 695 /* Now post the Irecvs corresponding to these messages */ 696 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 697 698 /* Now post the sends */ 699 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 700 for (i=0; i<nrqr; ++i) { 701 j = recv_status[i].MPI_SOURCE; 702 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr); 703 } 704 705 /* receive work done on other processors */ 706 { 707 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 708 PetscMPIInt idex; 709 PetscBT table_i; 710 MPI_Status *status2; 711 712 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 713 for (i=0; i<nrqs; ++i) { 714 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr); 715 /* Process the message */ 716 rbuf2_i = rbuf2[idex]; 717 ct1 = 2*rbuf2_i[0]+1; 718 jmax = rbuf2[idex][0]; 719 for (j=1; j<=jmax; j++) { 720 max = rbuf2_i[2*j]; 721 is_no = rbuf2_i[2*j-1]; 722 isz_i = isz[is_no]; 723 table_i = table[is_no]; 724 #if 
defined(PETSC_USE_CTABLE) 725 table_data_i = table_data[is_no]; 726 #else 727 data_i = data[is_no]; 728 #endif 729 for (k=0; k<max; k++,ct1++) { 730 row = rbuf2_i[ct1]; 731 if (!PetscBTLookupSet(table_i,row)) { 732 #if defined(PETSC_USE_CTABLE) 733 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 734 #else 735 data_i[isz_i] = row; 736 #endif 737 isz_i++; 738 } 739 } 740 isz[is_no] = isz_i; 741 } 742 } 743 744 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);} 745 ierr = PetscFree(status2);CHKERRQ(ierr); 746 } 747 748 #if defined(PETSC_USE_CTABLE) 749 tcount_max = 0; 750 for (i=0; i<imax; ++i) { 751 table_data_i = table_data[i]; 752 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 753 if (tcount_max < tcount) tcount_max = tcount; 754 } 755 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 756 #endif 757 758 for (i=0; i<imax; ++i) { 759 #if defined(PETSC_USE_CTABLE) 760 PetscTablePosition tpos; 761 table_data_i = table_data[i]; 762 763 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 764 while (tpos) { 765 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 766 tdata[--j] = --k; 767 } 768 ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 769 #else 770 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 771 #endif 772 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 773 } 774 775 ierr = PetscFree(iscomms);CHKERRQ(ierr); 776 ierr = PetscFree(onodes2);CHKERRQ(ierr); 777 ierr = PetscFree(olengths2);CHKERRQ(ierr); 778 779 ierr = PetscFree(pa);CHKERRQ(ierr); 780 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 781 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 782 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 783 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 784 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 785 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 786 ierr = PetscFree(s_status);CHKERRQ(ierr); 787 ierr = 
PetscFree(recv_status);CHKERRQ(ierr); 788 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 789 ierr = PetscFree(xdata);CHKERRQ(ierr); 790 ierr = PetscFree(isz1);CHKERRQ(ierr); 791 #if defined(PETSC_USE_CTABLE) 792 for (i=0; i<imax; i++) { 793 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 794 } 795 ierr = PetscFree(table_data);CHKERRQ(ierr); 796 ierr = PetscFree(tdata);CHKERRQ(ierr); 797 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 798 #else 799 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 800 #endif 801 PetscFunctionReturn(0); 802 } 803 804 /* 805 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 806 the work on the local processor. 807 808 Inputs: 809 C - MAT_MPIAIJ; 810 imax - total no of index sets processed at a time; 811 table - an array of char - size = m bits. 812 813 Output: 814 isz - array containing the count of the solution elements corresponding 815 to each index set; 816 data or table_data - pointer to the solutions 817 */ 818 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 819 { 820 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 821 Mat A = c->A,B = c->B; 822 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 823 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 824 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 825 PetscBT table_i; 826 #if defined(PETSC_USE_CTABLE) 827 PetscTable table_data_i; 828 PetscErrorCode ierr; 829 PetscTablePosition tpos; 830 PetscInt tcount,*tdata; 831 #else 832 PetscInt *data_i; 833 #endif 834 835 PetscFunctionBegin; 836 rstart = C->rmap->rstart; 837 cstart = C->cmap->rstart; 838 ai = a->i; 839 aj = a->j; 840 bi = b->i; 841 bj = b->j; 842 garray = c->garray; 843 844 for (i=0; i<imax; i++) { 845 #if defined(PETSC_USE_CTABLE) 846 /* copy existing entries of table_data_i into tdata[] */ 847 table_data_i = table_data[i]; 848 ierr = 
PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 849 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]); 850 851 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 852 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 853 while (tpos) { 854 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 855 tdata[--j] = --row; 856 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount); 857 } 858 #else 859 data_i = data[i]; 860 #endif 861 table_i = table[i]; 862 isz_i = isz[i]; 863 max = isz[i]; 864 865 for (j=0; j<max; j++) { 866 #if defined(PETSC_USE_CTABLE) 867 row = tdata[j] - rstart; 868 #else 869 row = data_i[j] - rstart; 870 #endif 871 start = ai[row]; 872 end = ai[row+1]; 873 for (k=start; k<end; k++) { /* Amat */ 874 val = aj[k] + cstart; 875 if (!PetscBTLookupSet(table_i,val)) { 876 #if defined(PETSC_USE_CTABLE) 877 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 878 #else 879 data_i[isz_i] = val; 880 #endif 881 isz_i++; 882 } 883 } 884 start = bi[row]; 885 end = bi[row+1]; 886 for (k=start; k<end; k++) { /* Bmat */ 887 val = garray[bj[k]]; 888 if (!PetscBTLookupSet(table_i,val)) { 889 #if defined(PETSC_USE_CTABLE) 890 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 891 #else 892 data_i[isz_i] = val; 893 #endif 894 isz_i++; 895 } 896 } 897 } 898 isz[i] = isz_i; 899 900 #if defined(PETSC_USE_CTABLE) 901 ierr = PetscFree(tdata);CHKERRQ(ierr); 902 #endif 903 } 904 PetscFunctionReturn(0); 905 } 906 907 /* 908 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 909 and return the output 910 911 Input: 912 C - the matrix 913 nrqr - no of messages being processed. 
rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray; /* compressed B column -> global column */

  /* Count the total number of requested rows (ct) and row entries (total_sz)
     across all incoming messages; each message is
     [nIS, (is_id,len) pairs, row indices...] */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Heuristic initial size for the single shared reply buffer; it is grown
     by 1.5x below whenever it fills up */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz >
total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1; /* skip the message header */
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer and re-fix the per-message pointers,
               which all alias into xdata[0] */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each copied row by its column indices from A and B */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr =
PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix: a sequential copy of the global
    matrix A is assembled on each rank via two MPI_Allgatherv operations
    (one for the CSR structure, optionally one for the numerical values).
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, directly into the slot of
       b->j that the allgather below will treat as this rank's contribution
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* same in-place trick as for b->j above */
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* MAT_REUSE_MATRIX path did not allocate these above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Extract a single parallel submatrix (ismax must be 1 on all ranks) as a
   sequential matrix on each process; supports MAT_INITIAL_MATRIX and
   MAT_REUSE_MATRIX, and the allcolumns fast path. */
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable     cmap,rmap;
  PetscInt       *cmap_loc,*rmap_loc;
#else
  PetscInt       *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
  MPI_Request
*s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1257 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1258 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1259 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1260 MPI_Comm comm; 1261 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1262 PetscMPIInt *onodes1,*olengths1,idex,end; 1263 Mat_SubSppt *smatis1; 1264 PetscBool isrowsorted,iscolsorted; 1265 1266 PetscFunctionBegin; 1267 PetscValidLogicalCollectiveInt(C,ismax,2); 1268 PetscValidLogicalCollectiveEnum(C,scall,5); 1269 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1270 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1271 ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1272 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1273 size = c->size; 1274 rank = c->rank; 1275 1276 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1277 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1278 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1279 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1280 if (allcolumns) { 1281 icol = NULL; 1282 ncol = C->cmap->N; 1283 } else { 1284 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1285 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1286 } 1287 1288 if (scall == MAT_INITIAL_MATRIX) { 1289 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1290 1291 /* Get some new tags to keep the communication clean */ 1292 tag1 = ((PetscObject)C)->tag; 1293 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1294 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1295 1296 /* evaluate communication - mesg to who, length of mesg, and buffer space 1297 required. 
Based on this, buffers are allocated, and data copied into them */ 1298 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1299 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1300 1301 /* w1[proc] = num of rows owned by proc -- to be requested */ 1302 proc = 0; 1303 nrqs = 0; /* num of outgoing messages */ 1304 for (j=0; j<nrow; j++) { 1305 row = irow[j]; 1306 if (!isrowsorted) proc = 0; 1307 while (row >= C->rmap->range[proc+1]) proc++; 1308 w1[proc]++; 1309 row2proc[j] = proc; /* map row index to proc */ 1310 1311 if (proc != rank && !w2[proc]) { 1312 w2[proc] = 1; nrqs++; 1313 } 1314 } 1315 w1[rank] = 0; /* rows owned by self will not be requested */ 1316 1317 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1318 for (proc=0,j=0; proc<size; proc++) { 1319 if (w1[proc]) { pa[j++] = proc;} 1320 } 1321 1322 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1323 msz = 0; /* total mesg length (for all procs) */ 1324 for (i=0; i<nrqs; i++) { 1325 proc = pa[i]; 1326 w1[proc] += 3; 1327 msz += w1[proc]; 1328 } 1329 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1330 1331 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1332 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1333 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1334 1335 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1336 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1337 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1338 1339 /* Now post the Irecvs corresponding to these messages */ 1340 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1341 1342 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1343 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1344 1345 /* Allocate Memory for outgoing messages 
*/ 1346 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1347 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1348 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1349 1350 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1351 iptr = tmp; 1352 for (i=0; i<nrqs; i++) { 1353 proc = pa[i]; 1354 sbuf1[proc] = iptr; 1355 iptr += w1[proc]; 1356 } 1357 1358 /* Form the outgoing messages */ 1359 /* Initialize the header space */ 1360 for (i=0; i<nrqs; i++) { 1361 proc = pa[i]; 1362 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1363 ptr[proc] = sbuf1[proc] + 3; 1364 } 1365 1366 /* Parse the isrow and copy data into outbuf */ 1367 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1368 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1369 proc = row2proc[j]; 1370 if (proc != rank) { /* copy to the outgoing buf*/ 1371 *ptr[proc] = irow[j]; 1372 ctr[proc]++; ptr[proc]++; 1373 } 1374 } 1375 1376 /* Update the headers for the current IS */ 1377 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1378 if ((ctr_j = ctr[j])) { 1379 sbuf1_j = sbuf1[j]; 1380 k = ++sbuf1_j[0]; 1381 sbuf1_j[2*k] = ctr_j; 1382 sbuf1_j[2*k-1] = 0; 1383 } 1384 } 1385 1386 /* Now post the sends */ 1387 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1388 for (i=0; i<nrqs; ++i) { 1389 proc = pa[i]; 1390 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 1391 } 1392 1393 /* Post Receives to capture the buffer size */ 1394 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1395 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1396 1397 rbuf2[0] = tmp + msz; 1398 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1399 1400 for (i=0; i<nrqs; ++i) { 1401 proc = pa[i]; 1402 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 1403 } 1404 1405 ierr = 
PetscFree2(w1,w2);CHKERRQ(ierr); 1406 1407 /* Send to other procs the buf size they should allocate */ 1408 /* Receive messages*/ 1409 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1410 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1411 1412 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 1413 for (i=0; i<nrqr; ++i) { 1414 req_size[i] = 0; 1415 rbuf1_i = rbuf1[i]; 1416 start = 2*rbuf1_i[0] + 1; 1417 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 1418 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1419 sbuf2_i = sbuf2[i]; 1420 for (j=start; j<end; j++) { 1421 k = rbuf1_i[j] - rstart; 1422 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1423 sbuf2_i[j] = ncols; 1424 req_size[i] += ncols; 1425 } 1426 req_source1[i] = r_status1[i].MPI_SOURCE; 1427 1428 /* form the header */ 1429 sbuf2_i[0] = req_size[i]; 1430 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1431 1432 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 1433 } 1434 1435 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1436 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1437 1438 /* rbuf2 is received, Post recv column indices a->j */ 1439 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 1440 1441 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1442 for (i=0; i<nrqs; ++i) { 1443 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1444 req_source2[i] = r_status2[i].MPI_SOURCE; 1445 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 1446 } 1447 1448 /* Wait on sends1 and sends2 */ 1449 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1450 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr); 1451 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1452 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1453 1454 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr); 1455 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1456 1457 /* Now allocate sending buffers for a->j, and send them off */ 1458 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1459 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1460 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1461 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1462 1463 for (i=0; i<nrqr; i++) { /* for each requested message */ 1464 rbuf1_i = rbuf1[i]; 1465 sbuf_aj_i = sbuf_aj[i]; 1466 ct1 = 2*rbuf1_i[0] + 1; 1467 ct2 = 0; 1468 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1469 1470 kmax = rbuf1[i][2]; 1471 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1472 row = rbuf1_i[ct1] - rstart; 1473 nzA = ai[row+1] - ai[row]; 1474 nzB = bi[row+1] - bi[row]; 1475 ncols = nzA + nzB; 1476 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1477 1478 /* load the column indices for this row into cols*/ 1479 cols = sbuf_aj_i + ct2; 1480 1481 lwrite = 0; 1482 for (l=0; l<nzB; l++) { 1483 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1484 } 1485 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1486 for (l=0; l<nzB; l++) { 1487 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1488 } 1489 1490 ct2 += ncols; 1491 } 1492 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 1493 } 1494 1495 /* create column map (cmap): global col of C -> local col of submat */ 1496 #if defined(PETSC_USE_CTABLE) 1497 if (!allcolumns) { 1498 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1499 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1500 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1501 if (icol[j] >= cstart && icol[j] <cend) { 1502 cmap_loc[icol[j] - cstart] = j+1; 1503 } else { /* use PetscTable for non-local col indices */ 1504 ierr = 
PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1505 } 1506 } 1507 } else { 1508 cmap = NULL; 1509 cmap_loc = NULL; 1510 } 1511 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1512 #else 1513 if (!allcolumns) { 1514 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1515 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1516 } else { 1517 cmap = NULL; 1518 } 1519 #endif 1520 1521 /* Create lens for MatSeqAIJSetPreallocation() */ 1522 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1523 1524 /* Compute lens from local part of C */ 1525 for (j=0; j<nrow; j++) { 1526 row = irow[j]; 1527 proc = row2proc[j]; 1528 if (proc == rank) { 1529 /* diagonal part A = c->A */ 1530 ncols = ai[row-rstart+1] - ai[row-rstart]; 1531 cols = aj + ai[row-rstart]; 1532 if (!allcolumns) { 1533 for (k=0; k<ncols; k++) { 1534 #if defined(PETSC_USE_CTABLE) 1535 tcol = cmap_loc[cols[k]]; 1536 #else 1537 tcol = cmap[cols[k]+cstart]; 1538 #endif 1539 if (tcol) lens[j]++; 1540 } 1541 } else { /* allcolumns */ 1542 lens[j] = ncols; 1543 } 1544 1545 /* off-diagonal part B = c->B */ 1546 ncols = bi[row-rstart+1] - bi[row-rstart]; 1547 cols = bj + bi[row-rstart]; 1548 if (!allcolumns) { 1549 for (k=0; k<ncols; k++) { 1550 #if defined(PETSC_USE_CTABLE) 1551 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1552 #else 1553 tcol = cmap[bmap[cols[k]]]; 1554 #endif 1555 if (tcol) lens[j]++; 1556 } 1557 } else { /* allcolumns */ 1558 lens[j] += ncols; 1559 } 1560 } 1561 } 1562 1563 /* Create row map (rmap): global row of C -> local row of submat */ 1564 #if defined(PETSC_USE_CTABLE) 1565 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1566 for (j=0; j<nrow; j++) { 1567 row = irow[j]; 1568 proc = row2proc[j]; 1569 if (proc == rank) { /* a local row */ 1570 rmap_loc[row - rstart] = j; 1571 } else { 1572 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1573 } 1574 } 1575 #else 1576 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 
1577 for (j=0; j<nrow; j++) { 1578 rmap[irow[j]] = j; 1579 } 1580 #endif 1581 1582 /* Update lens from offproc data */ 1583 /* recv a->j is done */ 1584 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 1585 for (i=0; i<nrqs; i++) { 1586 proc = pa[i]; 1587 sbuf1_i = sbuf1[proc]; 1588 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1589 ct1 = 2 + 1; 1590 ct2 = 0; 1591 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1592 rbuf3_i = rbuf3[i]; /* received C->j */ 1593 1594 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1595 max1 = sbuf1_i[2]; 1596 for (k=0; k<max1; k++,ct1++) { 1597 #if defined(PETSC_USE_CTABLE) 1598 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1599 row--; 1600 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1601 #else 1602 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1603 #endif 1604 /* Now, store row index of submat in sbuf1_i[ct1] */ 1605 sbuf1_i[ct1] = row; 1606 1607 nnz = rbuf2_i[ct1]; 1608 if (!allcolumns) { 1609 for (l=0; l<nnz; l++,ct2++) { 1610 #if defined(PETSC_USE_CTABLE) 1611 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1612 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1613 } else { 1614 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1615 } 1616 #else 1617 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1618 #endif 1619 if (tcol) lens[row]++; 1620 } 1621 } else { /* allcolumns */ 1622 lens[row] += nnz; 1623 } 1624 } 1625 } 1626 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr); 1627 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1628 1629 /* Create the submatrices */ 1630 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1631 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1632 1633 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1634 ierr = 
ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1635 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1636 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1637 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1638 1639 /* create struct Mat_SubSppt and attached it to submat */ 1640 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1641 subc = (Mat_SeqAIJ*)submat->data; 1642 subc->submatis1 = smatis1; 1643 1644 smatis1->id = 0; 1645 smatis1->nrqs = nrqs; 1646 smatis1->nrqr = nrqr; 1647 smatis1->rbuf1 = rbuf1; 1648 smatis1->rbuf2 = rbuf2; 1649 smatis1->rbuf3 = rbuf3; 1650 smatis1->sbuf2 = sbuf2; 1651 smatis1->req_source2 = req_source2; 1652 1653 smatis1->sbuf1 = sbuf1; 1654 smatis1->ptr = ptr; 1655 smatis1->tmp = tmp; 1656 smatis1->ctr = ctr; 1657 1658 smatis1->pa = pa; 1659 smatis1->req_size = req_size; 1660 smatis1->req_source1 = req_source1; 1661 1662 smatis1->allcolumns = allcolumns; 1663 smatis1->singleis = PETSC_TRUE; 1664 smatis1->row2proc = row2proc; 1665 smatis1->rmap = rmap; 1666 smatis1->cmap = cmap; 1667 #if defined(PETSC_USE_CTABLE) 1668 smatis1->rmap_loc = rmap_loc; 1669 smatis1->cmap_loc = cmap_loc; 1670 #endif 1671 1672 smatis1->destroy = submat->ops->destroy; 1673 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1674 submat->factortype = C->factortype; 1675 1676 /* compute rmax */ 1677 rmax = 0; 1678 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1679 1680 } else { /* scall == MAT_REUSE_MATRIX */ 1681 submat = submats[0]; 1682 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1683 1684 subc = (Mat_SeqAIJ*)submat->data; 1685 rmax = subc->rmax; 1686 smatis1 = subc->submatis1; 1687 nrqs = smatis1->nrqs; 1688 nrqr = smatis1->nrqr; 1689 rbuf1 = smatis1->rbuf1; 1690 rbuf2 = smatis1->rbuf2; 1691 rbuf3 = smatis1->rbuf3; 1692 req_source2 = smatis1->req_source2; 1693 1694 sbuf1 = smatis1->sbuf1; 1695 sbuf2 = smatis1->sbuf2; 1696 ptr = smatis1->ptr; 1697 tmp = smatis1->tmp; 1698 ctr = smatis1->ctr; 1699 1700 pa = smatis1->pa; 1701 req_size = smatis1->req_size; 1702 req_source1 = smatis1->req_source1; 1703 1704 allcolumns = smatis1->allcolumns; 1705 row2proc = smatis1->row2proc; 1706 rmap = smatis1->rmap; 1707 cmap = smatis1->cmap; 1708 #if defined(PETSC_USE_CTABLE) 1709 rmap_loc = smatis1->rmap_loc; 1710 cmap_loc = smatis1->cmap_loc; 1711 #endif 1712 } 1713 1714 /* Post recv matrix values */ 1715 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1716 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1717 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1718 for (i=0; i<nrqs; ++i) { 1719 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1720 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 1721 } 1722 1723 /* Allocate sending buffers for a->a, and send them off */ 1724 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1725 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1726 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1727 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1728 1729 for (i=0; i<nrqr; i++) { 1730 rbuf1_i = rbuf1[i]; 1731 sbuf_aa_i = sbuf_aa[i]; 1732 ct1 = 2*rbuf1_i[0]+1; 1733 ct2 = 0; 1734 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1735 1736 kmax = rbuf1_i[2]; 1737 for (k=0; k<kmax; k++,ct1++) { 1738 row = rbuf1_i[ct1] - rstart; 1739 nzA = ai[row+1] - ai[row]; 1740 nzB = bi[row+1] - 
bi[row]; 1741 ncols = nzA + nzB; 1742 cworkB = bj + bi[row]; 1743 vworkA = a_a + ai[row]; 1744 vworkB = b_a + bi[row]; 1745 1746 /* load the column values for this row into vals*/ 1747 vals = sbuf_aa_i + ct2; 1748 1749 lwrite = 0; 1750 for (l=0; l<nzB; l++) { 1751 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1752 } 1753 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1754 for (l=0; l<nzB; l++) { 1755 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1756 } 1757 1758 ct2 += ncols; 1759 } 1760 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 1761 } 1762 1763 /* Assemble submat */ 1764 /* First assemble the local rows */ 1765 for (j=0; j<nrow; j++) { 1766 row = irow[j]; 1767 proc = row2proc[j]; 1768 if (proc == rank) { 1769 Crow = row - rstart; /* local row index of C */ 1770 #if defined(PETSC_USE_CTABLE) 1771 row = rmap_loc[Crow]; /* row index of submat */ 1772 #else 1773 row = rmap[row]; 1774 #endif 1775 1776 if (allcolumns) { 1777 /* diagonal part A = c->A */ 1778 ncols = ai[Crow+1] - ai[Crow]; 1779 cols = aj + ai[Crow]; 1780 vals = a_a + ai[Crow]; 1781 i = 0; 1782 for (k=0; k<ncols; k++) { 1783 subcols[i] = cols[k] + cstart; 1784 subvals[i++] = vals[k]; 1785 } 1786 1787 /* off-diagonal part B = c->B */ 1788 ncols = bi[Crow+1] - bi[Crow]; 1789 cols = bj + bi[Crow]; 1790 vals = b_a + bi[Crow]; 1791 for (k=0; k<ncols; k++) { 1792 subcols[i] = bmap[cols[k]]; 1793 subvals[i++] = vals[k]; 1794 } 1795 1796 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1797 1798 } else { /* !allcolumns */ 1799 #if defined(PETSC_USE_CTABLE) 1800 /* diagonal part A = c->A */ 1801 ncols = ai[Crow+1] - ai[Crow]; 1802 cols = aj + ai[Crow]; 1803 vals = a_a + ai[Crow]; 1804 i = 0; 1805 for (k=0; k<ncols; k++) { 1806 tcol = cmap_loc[cols[k]]; 1807 if (tcol) { 1808 subcols[i] = --tcol; 1809 subvals[i++] = vals[k]; 1810 } 1811 } 1812 1813 /* off-diagonal part B = c->B */ 1814 
ncols = bi[Crow+1] - bi[Crow]; 1815 cols = bj + bi[Crow]; 1816 vals = b_a + bi[Crow]; 1817 for (k=0; k<ncols; k++) { 1818 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1819 if (tcol) { 1820 subcols[i] = --tcol; 1821 subvals[i++] = vals[k]; 1822 } 1823 } 1824 #else 1825 /* diagonal part A = c->A */ 1826 ncols = ai[Crow+1] - ai[Crow]; 1827 cols = aj + ai[Crow]; 1828 vals = a_a + ai[Crow]; 1829 i = 0; 1830 for (k=0; k<ncols; k++) { 1831 tcol = cmap[cols[k]+cstart]; 1832 if (tcol) { 1833 subcols[i] = --tcol; 1834 subvals[i++] = vals[k]; 1835 } 1836 } 1837 1838 /* off-diagonal part B = c->B */ 1839 ncols = bi[Crow+1] - bi[Crow]; 1840 cols = bj + bi[Crow]; 1841 vals = b_a + bi[Crow]; 1842 for (k=0; k<ncols; k++) { 1843 tcol = cmap[bmap[cols[k]]]; 1844 if (tcol) { 1845 subcols[i] = --tcol; 1846 subvals[i++] = vals[k]; 1847 } 1848 } 1849 #endif 1850 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1851 } 1852 } 1853 } 1854 1855 /* Now assemble the off-proc rows */ 1856 for (i=0; i<nrqs; i++) { /* for each requested message */ 1857 /* recv values from other processes */ 1858 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr); 1859 proc = pa[idex]; 1860 sbuf1_i = sbuf1[proc]; 1861 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1862 ct1 = 2 + 1; 1863 ct2 = 0; /* count of received C->j */ 1864 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1865 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1866 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1867 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1868 1869 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1870 max1 = sbuf1_i[2]; /* num of rows */ 1871 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1872 row = sbuf1_i[ct1]; /* row index of 
submat */ 1873 if (!allcolumns) { 1874 idex = 0; 1875 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1876 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1877 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1878 #if defined(PETSC_USE_CTABLE) 1879 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1880 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1881 } else { 1882 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1883 } 1884 #else 1885 tcol = cmap[rbuf3_i[ct2]]; 1886 #endif 1887 if (tcol) { 1888 subcols[idex] = --tcol; /* may not be sorted */ 1889 subvals[idex++] = rbuf4_i[ct2]; 1890 1891 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1892 For reuse, we replace received C->j with index that should be inserted to submat */ 1893 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1894 } 1895 } 1896 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1897 } else { /* scall == MAT_REUSE_MATRIX */ 1898 submat = submats[0]; 1899 subc = (Mat_SeqAIJ*)submat->data; 1900 1901 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1902 for (l=0; l<nnz; l++) { 1903 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1904 subvals[idex++] = rbuf4_i[ct2]; 1905 } 1906 1907 bj = subc->j + subc->i[row]; /* sorted column indices */ 1908 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1909 } 1910 } else { /* allcolumns */ 1911 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1912 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1913 ct2 += nnz; 1914 } 1915 } 1916 } 1917 1918 /* sending a->a are done */ 1919 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr); 1920 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1921 1922 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1923 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1924 submats[0] = submat; 1925 1926 /* Restore the indices */ 1927 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1928 if (!allcolumns) { 1929 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1930 } 1931 1932 /* Destroy allocated memory */ 1933 for (i=0; i<nrqs; ++i) { 1934 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1935 } 1936 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1937 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1938 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1939 1940 if (scall == MAT_INITIAL_MATRIX) { 1941 ierr = PetscFree(lens);CHKERRQ(ierr); 1942 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1943 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1944 } 1945 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1946 ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1947 PetscFunctionReturn(0); 1948 } 1949 1950 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1951 { 1952 PetscErrorCode ierr; 1953 PetscInt ncol; 1954 PetscBool colflag,allcolumns=PETSC_FALSE; 1955 1956 PetscFunctionBegin; 1957 /* Allocate memory to hold all the submatrices */ 1958 if (scall == MAT_INITIAL_MATRIX) { 1959 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1960 } 1961 1962 /* Check for special case: each processor gets entire matrix columns */ 1963 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1964 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1965 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1966 1967 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1968 PetscFunctionReturn(0); 1969 } 1970 1971 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1972 { 1973 PetscErrorCode ierr; 1974 PetscInt 
nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1975 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1976 Mat_SeqAIJ *subc; 1977 Mat_SubSppt *smat; 1978 1979 PetscFunctionBegin; 1980 /* Check for special case: each processor has a single IS */ 1981 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1982 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1983 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1984 PetscFunctionReturn(0); 1985 } 1986 1987 /* Collect global wantallmatrix and nstages */ 1988 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1989 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1990 if (!nmax) nmax = 1; 1991 1992 if (scall == MAT_INITIAL_MATRIX) { 1993 /* Collect global wantallmatrix and nstages */ 1994 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1995 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1996 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1997 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1998 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1999 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2000 wantallmatrix = PETSC_TRUE; 2001 2002 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2003 } 2004 } 2005 2006 /* Determine the number of stages through which submatrices are done 2007 Each stage will extract nmax submatrices. 2008 nmax is determined by the matrix column dimension. 2009 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2010 */ 2011 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 2012 2013 in[0] = -1*(PetscInt)wantallmatrix; 2014 in[1] = nstages; 2015 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr); 2016 wantallmatrix = (PetscBool)(-out[0]); 2017 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2018 2019 } else { /* MAT_REUSE_MATRIX */ 2020 if (ismax) { 2021 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2022 smat = subc->submatis1; 2023 } else { /* (*submat)[0] is a dummy matrix */ 2024 smat = (Mat_SubSppt*)(*submat)[0]->data; 2025 } 2026 if (!smat) { 2027 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2028 wantallmatrix = PETSC_TRUE; 2029 } else if (smat->singleis) { 2030 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2031 PetscFunctionReturn(0); 2032 } else { 2033 nstages = smat->nstages; 2034 } 2035 } 2036 2037 if (wantallmatrix) { 2038 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2039 PetscFunctionReturn(0); 2040 } 2041 2042 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2043 if (scall == MAT_INITIAL_MATRIX) { 2044 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2045 } 2046 2047 for (i=0,pos=0; i<nstages; i++) { 2048 if (pos+nmax <= ismax) max_no = nmax; 2049 else if (pos >= ismax) max_no = 0; 2050 else max_no = ismax-pos; 2051 2052 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2053 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2054 smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++; 2055 smat->nstages = nstages; 2056 } 2057 pos += max_no; 2058 } 2059 2060 if (ismax && scall == MAT_INITIAL_MATRIX) { 2061 /* save nstages for reuse */ 2062 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2063 smat = subc->submatis1; 2064 smat->nstages = nstages; 2065 } 2066 PetscFunctionReturn(0); 2067 } 2068 2069 
/* -------------------------------------------------------------------------*/ 2070 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2071 { 2072 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2073 Mat A = c->A; 2074 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2075 const PetscInt **icol,**irow; 2076 PetscInt *nrow,*ncol,start; 2077 PetscErrorCode ierr; 2078 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2079 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2080 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2081 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2082 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2083 #if defined(PETSC_USE_CTABLE) 2084 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2085 #else 2086 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2087 #endif 2088 const PetscInt *irow_i; 2089 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2090 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2091 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2092 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2093 MPI_Status *r_status3,*r_status4,*s_status4; 2094 MPI_Comm comm; 2095 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2096 PetscMPIInt *onodes1,*olengths1,end; 2097 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2098 Mat_SubSppt *smat_i; 2099 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2100 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2101 2102 PetscFunctionBegin; 2103 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2104 size = c->size; 2105 rank = c->rank; 2106 2107 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2108 ierr = 
PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2109 2110 for (i=0; i<ismax; i++) { 2111 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2112 if (!issorted[i]) iscsorted = issorted[i]; 2113 2114 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2115 2116 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2117 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2118 2119 /* Check for special case: allcolumn */ 2120 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2121 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2122 if (colflag && ncol[i] == C->cmap->N) { 2123 allcolumns[i] = PETSC_TRUE; 2124 icol[i] = NULL; 2125 } else { 2126 allcolumns[i] = PETSC_FALSE; 2127 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2128 } 2129 } 2130 2131 if (scall == MAT_REUSE_MATRIX) { 2132 /* Assumes new rows are same length as the old rows */ 2133 for (i=0; i<ismax; i++) { 2134 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2135 subc = (Mat_SeqAIJ*)submats[i]->data; 2136 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 2137 2138 /* Initial matrix as if empty */ 2139 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2140 2141 smat_i = subc->submatis1; 2142 2143 nrqs = smat_i->nrqs; 2144 nrqr = smat_i->nrqr; 2145 rbuf1 = smat_i->rbuf1; 2146 rbuf2 = smat_i->rbuf2; 2147 rbuf3 = smat_i->rbuf3; 2148 req_source2 = smat_i->req_source2; 2149 2150 sbuf1 = smat_i->sbuf1; 2151 sbuf2 = smat_i->sbuf2; 2152 ptr = smat_i->ptr; 2153 tmp = smat_i->tmp; 2154 ctr = smat_i->ctr; 2155 2156 pa = smat_i->pa; 2157 req_size = smat_i->req_size; 2158 req_source1 = smat_i->req_source1; 2159 2160 allcolumns[i] = smat_i->allcolumns; 2161 row2proc[i] = smat_i->row2proc; 2162 rmap[i] = smat_i->rmap; 2163 cmap[i] = smat_i->cmap; 2164 } 2165 2166 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2167 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2168 smat_i = (Mat_SubSppt*)submats[0]->data; 2169 2170 nrqs = smat_i->nrqs; 2171 nrqr = smat_i->nrqr; 2172 rbuf1 = smat_i->rbuf1; 2173 rbuf2 = smat_i->rbuf2; 2174 rbuf3 = smat_i->rbuf3; 2175 req_source2 = smat_i->req_source2; 2176 2177 sbuf1 = smat_i->sbuf1; 2178 sbuf2 = smat_i->sbuf2; 2179 ptr = smat_i->ptr; 2180 tmp = smat_i->tmp; 2181 ctr = smat_i->ctr; 2182 2183 pa = smat_i->pa; 2184 req_size = smat_i->req_size; 2185 req_source1 = smat_i->req_source1; 2186 2187 allcolumns[0] = PETSC_FALSE; 2188 } 2189 } else { /* scall == MAT_INITIAL_MATRIX */ 2190 /* Get some new tags to keep the communication clean */ 2191 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2192 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2193 2194 /* evaluate communication - mesg to who, length of mesg, and buffer space 2195 required. 
Based on this, buffers are allocated, and data copied into them*/ 2196 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2197 2198 for (i=0; i<ismax; i++) { 2199 jmax = nrow[i]; 2200 irow_i = irow[i]; 2201 2202 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2203 row2proc[i] = row2proc_i; 2204 2205 if (issorted[i]) proc = 0; 2206 for (j=0; j<jmax; j++) { 2207 if (!issorted[i]) proc = 0; 2208 row = irow_i[j]; 2209 while (row >= C->rmap->range[proc+1]) proc++; 2210 w4[proc]++; 2211 row2proc_i[j] = proc; /* map row index to proc */ 2212 } 2213 for (j=0; j<size; j++) { 2214 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2215 } 2216 } 2217 2218 nrqs = 0; /* no of outgoing messages */ 2219 msz = 0; /* total mesg length (for all procs) */ 2220 w1[rank] = 0; /* no mesg sent to self */ 2221 w3[rank] = 0; 2222 for (i=0; i<size; i++) { 2223 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2224 } 2225 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2226 for (i=0,j=0; i<size; i++) { 2227 if (w1[i]) { pa[j] = i; j++; } 2228 } 2229 2230 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2231 for (i=0; i<nrqs; i++) { 2232 j = pa[i]; 2233 w1[j] += w2[j] + 2* w3[j]; 2234 msz += w1[j]; 2235 } 2236 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2237 2238 /* Determine the number of messages to expect, their lengths, from from-ids */ 2239 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2240 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2241 2242 /* Now post the Irecvs corresponding to these messages */ 2243 tag0 = ((PetscObject)C)->tag; 2244 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2245 2246 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2247 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2248 2249 /* 
Allocate Memory for outgoing messages */ 2250 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2251 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2252 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2253 2254 { 2255 PetscInt *iptr = tmp; 2256 k = 0; 2257 for (i=0; i<nrqs; i++) { 2258 j = pa[i]; 2259 iptr += k; 2260 sbuf1[j] = iptr; 2261 k = w1[j]; 2262 } 2263 } 2264 2265 /* Form the outgoing messages. Initialize the header space */ 2266 for (i=0; i<nrqs; i++) { 2267 j = pa[i]; 2268 sbuf1[j][0] = 0; 2269 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2270 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2271 } 2272 2273 /* Parse the isrow and copy data into outbuf */ 2274 for (i=0; i<ismax; i++) { 2275 row2proc_i = row2proc[i]; 2276 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2277 irow_i = irow[i]; 2278 jmax = nrow[i]; 2279 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2280 proc = row2proc_i[j]; 2281 if (proc != rank) { /* copy to the outgoing buf*/ 2282 ctr[proc]++; 2283 *ptr[proc] = irow_i[j]; 2284 ptr[proc]++; 2285 } 2286 } 2287 /* Update the headers for the current IS */ 2288 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2289 if ((ctr_j = ctr[j])) { 2290 sbuf1_j = sbuf1[j]; 2291 k = ++sbuf1_j[0]; 2292 sbuf1_j[2*k] = ctr_j; 2293 sbuf1_j[2*k-1] = i; 2294 } 2295 } 2296 } 2297 2298 /* Now post the sends */ 2299 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2300 for (i=0; i<nrqs; ++i) { 2301 j = pa[i]; 2302 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr); 2303 } 2304 2305 /* Post Receives to capture the buffer size */ 2306 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2307 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2308 rbuf2[0] = tmp + msz; 2309 for (i=1; i<nrqs; ++i) { 2310 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2311 } 2312 for (i=0; i<nrqs; ++i) { 2313 j = pa[i]; 2314 ierr = 
MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 2315 } 2316 2317 /* Send to other procs the buf size they should allocate */ 2318 /* Receive messages*/ 2319 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2320 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2321 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2322 { 2323 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2324 PetscInt *sbuf2_i; 2325 2326 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 2327 for (i=0; i<nrqr; ++i) { 2328 req_size[i] = 0; 2329 rbuf1_i = rbuf1[i]; 2330 start = 2*rbuf1_i[0] + 1; 2331 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 2332 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2333 sbuf2_i = sbuf2[i]; 2334 for (j=start; j<end; j++) { 2335 id = rbuf1_i[j] - rstart; 2336 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2337 sbuf2_i[j] = ncols; 2338 req_size[i] += ncols; 2339 } 2340 req_source1[i] = r_status1[i].MPI_SOURCE; 2341 /* form the header */ 2342 sbuf2_i[0] = req_size[i]; 2343 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2344 2345 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 2346 } 2347 } 2348 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2349 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2350 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2351 2352 /* Receive messages*/ 2353 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2354 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2355 2356 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 2357 for (i=0; i<nrqs; ++i) { 2358 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2359 req_source2[i] = r_status2[i].MPI_SOURCE; 2360 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 2361 } 2362 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2363 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2364 2365 /* Wait 
on sends1 and sends2 */ 2366 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2367 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2368 2369 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);} 2370 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);} 2371 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2372 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2373 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2374 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2375 2376 /* Now allocate sending buffers for a->j, and send them off */ 2377 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2378 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2379 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2380 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2381 2382 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2383 { 2384 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2385 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2386 PetscInt cend = C->cmap->rend; 2387 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2388 2389 for (i=0; i<nrqr; i++) { 2390 rbuf1_i = rbuf1[i]; 2391 sbuf_aj_i = sbuf_aj[i]; 2392 ct1 = 2*rbuf1_i[0] + 1; 2393 ct2 = 0; 2394 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2395 kmax = rbuf1[i][2*j]; 2396 for (k=0; k<kmax; k++,ct1++) { 2397 row = rbuf1_i[ct1] - rstart; 2398 nzA = a_i[row+1] - a_i[row]; 2399 nzB = b_i[row+1] - b_i[row]; 2400 ncols = nzA + nzB; 2401 cworkA = a_j + a_i[row]; 2402 cworkB = b_j + b_i[row]; 2403 2404 /* load the column indices for this row into cols */ 2405 cols = sbuf_aj_i + ct2; 2406 2407 lwrite = 0; 2408 for (l=0; l<nzB; l++) { 2409 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2410 } 2411 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2412 for (l=0; l<nzB; l++) { 2413 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2414 } 2415 2416 ct2 += ncols; 2417 } 2418 } 2419 ierr = 
MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 2420 } 2421 } 2422 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2423 2424 /* create col map: global col of C -> local col of submatrices */ 2425 { 2426 const PetscInt *icol_i; 2427 #if defined(PETSC_USE_CTABLE) 2428 for (i=0; i<ismax; i++) { 2429 if (!allcolumns[i]) { 2430 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2431 2432 jmax = ncol[i]; 2433 icol_i = icol[i]; 2434 cmap_i = cmap[i]; 2435 for (j=0; j<jmax; j++) { 2436 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2437 } 2438 } else cmap[i] = NULL; 2439 } 2440 #else 2441 for (i=0; i<ismax; i++) { 2442 if (!allcolumns[i]) { 2443 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2444 jmax = ncol[i]; 2445 icol_i = icol[i]; 2446 cmap_i = cmap[i]; 2447 for (j=0; j<jmax; j++) { 2448 cmap_i[icol_i[j]] = j+1; 2449 } 2450 } else cmap[i] = NULL; 2451 } 2452 #endif 2453 } 2454 2455 /* Create lens which is required for MatCreate... 
*/ 2456 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2457 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2458 2459 if (ismax) { 2460 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2461 } 2462 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2463 2464 /* Update lens from local data */ 2465 for (i=0; i<ismax; i++) { 2466 row2proc_i = row2proc[i]; 2467 jmax = nrow[i]; 2468 if (!allcolumns[i]) cmap_i = cmap[i]; 2469 irow_i = irow[i]; 2470 lens_i = lens[i]; 2471 for (j=0; j<jmax; j++) { 2472 row = irow_i[j]; 2473 proc = row2proc_i[j]; 2474 if (proc == rank) { 2475 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2476 if (!allcolumns[i]) { 2477 for (k=0; k<ncols; k++) { 2478 #if defined(PETSC_USE_CTABLE) 2479 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2480 #else 2481 tcol = cmap_i[cols[k]]; 2482 #endif 2483 if (tcol) lens_i[j]++; 2484 } 2485 } else { /* allcolumns */ 2486 lens_i[j] = ncols; 2487 } 2488 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2489 } 2490 } 2491 } 2492 2493 /* Create row map: global row of C -> local row of submatrices */ 2494 #if defined(PETSC_USE_CTABLE) 2495 for (i=0; i<ismax; i++) { 2496 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2497 irow_i = irow[i]; 2498 jmax = nrow[i]; 2499 for (j=0; j<jmax; j++) { 2500 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2501 } 2502 } 2503 #else 2504 for (i=0; i<ismax; i++) { 2505 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2506 rmap_i = rmap[i]; 2507 irow_i = irow[i]; 2508 jmax = nrow[i]; 2509 for (j=0; j<jmax; j++) { 2510 rmap_i[irow_i[j]] = j; 2511 } 2512 } 2513 #endif 2514 2515 /* Update lens from offproc data */ 2516 { 2517 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2518 2519 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 2520 for (tmp2=0; tmp2<nrqs; tmp2++) { 2521 sbuf1_i = sbuf1[pa[tmp2]]; 2522 jmax = sbuf1_i[0]; 2523 ct1 = 2*jmax+1; 2524 ct2 = 0; 2525 rbuf2_i = 
rbuf2[tmp2]; 2526 rbuf3_i = rbuf3[tmp2]; 2527 for (j=1; j<=jmax; j++) { 2528 is_no = sbuf1_i[2*j-1]; 2529 max1 = sbuf1_i[2*j]; 2530 lens_i = lens[is_no]; 2531 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2532 rmap_i = rmap[is_no]; 2533 for (k=0; k<max1; k++,ct1++) { 2534 #if defined(PETSC_USE_CTABLE) 2535 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2536 row--; 2537 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2538 #else 2539 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2540 #endif 2541 max2 = rbuf2_i[ct1]; 2542 for (l=0; l<max2; l++,ct2++) { 2543 if (!allcolumns[is_no]) { 2544 #if defined(PETSC_USE_CTABLE) 2545 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2546 #else 2547 tcol = cmap_i[rbuf3_i[ct2]]; 2548 #endif 2549 if (tcol) lens_i[row]++; 2550 } else { /* allcolumns */ 2551 lens_i[row]++; /* lens_i[row] += max2 ? */ 2552 } 2553 } 2554 } 2555 } 2556 } 2557 } 2558 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2559 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);} 2560 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2561 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2562 2563 /* Create the submatrices */ 2564 for (i=0; i<ismax; i++) { 2565 PetscInt rbs,cbs; 2566 2567 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2568 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2569 2570 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2571 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2572 2573 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2574 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2575 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2576 2577 /* create struct Mat_SubSppt and attached it to submat */ 2578 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2579 subc = (Mat_SeqAIJ*)submats[i]->data; 2580 subc->submatis1 = smat_i; 2581 2582 
smat_i->destroy = submats[i]->ops->destroy; 2583 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2584 submats[i]->factortype = C->factortype; 2585 2586 smat_i->id = i; 2587 smat_i->nrqs = nrqs; 2588 smat_i->nrqr = nrqr; 2589 smat_i->rbuf1 = rbuf1; 2590 smat_i->rbuf2 = rbuf2; 2591 smat_i->rbuf3 = rbuf3; 2592 smat_i->sbuf2 = sbuf2; 2593 smat_i->req_source2 = req_source2; 2594 2595 smat_i->sbuf1 = sbuf1; 2596 smat_i->ptr = ptr; 2597 smat_i->tmp = tmp; 2598 smat_i->ctr = ctr; 2599 2600 smat_i->pa = pa; 2601 smat_i->req_size = req_size; 2602 smat_i->req_source1 = req_source1; 2603 2604 smat_i->allcolumns = allcolumns[i]; 2605 smat_i->singleis = PETSC_FALSE; 2606 smat_i->row2proc = row2proc[i]; 2607 smat_i->rmap = rmap[i]; 2608 smat_i->cmap = cmap[i]; 2609 } 2610 2611 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2612 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2613 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2614 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2615 2616 /* create struct Mat_SubSppt and attached it to submat */ 2617 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2618 submats[0]->data = (void*)smat_i; 2619 2620 smat_i->destroy = submats[0]->ops->destroy; 2621 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2622 submats[0]->factortype = C->factortype; 2623 2624 smat_i->id = 0; 2625 smat_i->nrqs = nrqs; 2626 smat_i->nrqr = nrqr; 2627 smat_i->rbuf1 = rbuf1; 2628 smat_i->rbuf2 = rbuf2; 2629 smat_i->rbuf3 = rbuf3; 2630 smat_i->sbuf2 = sbuf2; 2631 smat_i->req_source2 = req_source2; 2632 2633 smat_i->sbuf1 = sbuf1; 2634 smat_i->ptr = ptr; 2635 smat_i->tmp = tmp; 2636 smat_i->ctr = ctr; 2637 2638 smat_i->pa = pa; 2639 smat_i->req_size = req_size; 2640 smat_i->req_source1 = req_source1; 2641 2642 smat_i->allcolumns = PETSC_FALSE; 2643 smat_i->singleis = PETSC_FALSE; 2644 smat_i->row2proc = NULL; 2645 smat_i->rmap = NULL; 2646 smat_i->cmap = NULL; 2647 } 2648 2649 
if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2650 ierr = PetscFree(lens);CHKERRQ(ierr); 2651 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2652 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2653 2654 } /* endof scall == MAT_INITIAL_MATRIX */ 2655 2656 /* Post recv matrix values */ 2657 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2658 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2659 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2660 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2661 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2662 for (i=0; i<nrqs; ++i) { 2663 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2664 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 2665 } 2666 2667 /* Allocate sending buffers for a->a, and send them off */ 2668 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2669 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2670 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2671 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2672 2673 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2674 { 2675 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2676 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2677 PetscInt cend = C->cmap->rend; 2678 PetscInt *b_j = b->j; 2679 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2680 2681 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2682 ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2683 for (i=0; i<nrqr; i++) { 2684 rbuf1_i = rbuf1[i]; 2685 sbuf_aa_i = sbuf_aa[i]; 2686 ct1 = 2*rbuf1_i[0]+1; 2687 ct2 = 0; 2688 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2689 kmax = rbuf1_i[2*j]; 2690 for (k=0; k<kmax; k++,ct1++) { 2691 row = rbuf1_i[ct1] - rstart; 2692 nzA = a_i[row+1] - a_i[row]; 2693 nzB = b_i[row+1] - b_i[row]; 2694 ncols = nzA + nzB; 2695 cworkB = b_j + b_i[row]; 2696 vworkA = a_a + 
a_i[row]; 2697 vworkB = b_a + b_i[row]; 2698 2699 /* load the column values for this row into vals*/ 2700 vals = sbuf_aa_i+ct2; 2701 2702 lwrite = 0; 2703 for (l=0; l<nzB; l++) { 2704 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2705 } 2706 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2707 for (l=0; l<nzB; l++) { 2708 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2709 } 2710 2711 ct2 += ncols; 2712 } 2713 } 2714 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 2715 } 2716 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2717 ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2718 } 2719 2720 /* Assemble the matrices */ 2721 /* First assemble the local rows */ 2722 for (i=0; i<ismax; i++) { 2723 row2proc_i = row2proc[i]; 2724 subc = (Mat_SeqAIJ*)submats[i]->data; 2725 imat_ilen = subc->ilen; 2726 imat_j = subc->j; 2727 imat_i = subc->i; 2728 imat_a = subc->a; 2729 2730 if (!allcolumns[i]) cmap_i = cmap[i]; 2731 rmap_i = rmap[i]; 2732 irow_i = irow[i]; 2733 jmax = nrow[i]; 2734 for (j=0; j<jmax; j++) { 2735 row = irow_i[j]; 2736 proc = row2proc_i[j]; 2737 if (proc == rank) { 2738 old_row = row; 2739 #if defined(PETSC_USE_CTABLE) 2740 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2741 row--; 2742 #else 2743 row = rmap_i[row]; 2744 #endif 2745 ilen_row = imat_ilen[row]; 2746 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2747 mat_i = imat_i[row]; 2748 mat_a = imat_a + mat_i; 2749 mat_j = imat_j + mat_i; 2750 if (!allcolumns[i]) { 2751 for (k=0; k<ncols; k++) { 2752 #if defined(PETSC_USE_CTABLE) 2753 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2754 #else 2755 tcol = cmap_i[cols[k]]; 2756 #endif 2757 if (tcol) { 2758 *mat_j++ = tcol - 1; 2759 *mat_a++ = vals[k]; 2760 ilen_row++; 2761 } 2762 } 2763 } else { /* allcolumns */ 2764 for (k=0; k<ncols; k++) { 2765 *mat_j++ = cols[k]; 
/* global col index! */ 2766 *mat_a++ = vals[k]; 2767 ilen_row++; 2768 } 2769 } 2770 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2771 2772 imat_ilen[row] = ilen_row; 2773 } 2774 } 2775 } 2776 2777 /* Now assemble the off proc rows */ 2778 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRMPI(ierr); 2779 for (tmp2=0; tmp2<nrqs; tmp2++) { 2780 sbuf1_i = sbuf1[pa[tmp2]]; 2781 jmax = sbuf1_i[0]; 2782 ct1 = 2*jmax + 1; 2783 ct2 = 0; 2784 rbuf2_i = rbuf2[tmp2]; 2785 rbuf3_i = rbuf3[tmp2]; 2786 rbuf4_i = rbuf4[tmp2]; 2787 for (j=1; j<=jmax; j++) { 2788 is_no = sbuf1_i[2*j-1]; 2789 rmap_i = rmap[is_no]; 2790 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2791 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2792 imat_ilen = subc->ilen; 2793 imat_j = subc->j; 2794 imat_i = subc->i; 2795 imat_a = subc->a; 2796 max1 = sbuf1_i[2*j]; 2797 for (k=0; k<max1; k++,ct1++) { 2798 row = sbuf1_i[ct1]; 2799 #if defined(PETSC_USE_CTABLE) 2800 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2801 row--; 2802 #else 2803 row = rmap_i[row]; 2804 #endif 2805 ilen = imat_ilen[row]; 2806 mat_i = imat_i[row]; 2807 mat_a = imat_a + mat_i; 2808 mat_j = imat_j + mat_i; 2809 max2 = rbuf2_i[ct1]; 2810 if (!allcolumns[is_no]) { 2811 for (l=0; l<max2; l++,ct2++) { 2812 #if defined(PETSC_USE_CTABLE) 2813 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2814 #else 2815 tcol = cmap_i[rbuf3_i[ct2]]; 2816 #endif 2817 if (tcol) { 2818 *mat_j++ = tcol - 1; 2819 *mat_a++ = rbuf4_i[ct2]; 2820 ilen++; 2821 } 2822 } 2823 } else { /* allcolumns */ 2824 for (l=0; l<max2; l++,ct2++) { 2825 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2826 *mat_a++ = rbuf4_i[ct2]; 2827 ilen++; 2828 } 2829 } 2830 imat_ilen[row] = ilen; 2831 } 2832 } 2833 } 2834 2835 if (!iscsorted) { /* sort column indices of the rows */ 2836 for (i=0; i<ismax; i++) { 2837 subc = (Mat_SeqAIJ*)submats[i]->data; 2838 imat_j = subc->j; 2839 imat_i = subc->i; 2840 imat_a = subc->a; 2841 
imat_ilen = subc->ilen; 2842 2843 if (allcolumns[i]) continue; 2844 jmax = nrow[i]; 2845 for (j=0; j<jmax; j++) { 2846 mat_i = imat_i[j]; 2847 mat_a = imat_a + mat_i; 2848 mat_j = imat_j + mat_i; 2849 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2850 } 2851 } 2852 } 2853 2854 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2855 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2856 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);} 2857 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2858 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2859 2860 /* Restore the indices */ 2861 for (i=0; i<ismax; i++) { 2862 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2863 if (!allcolumns[i]) { 2864 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2865 } 2866 } 2867 2868 for (i=0; i<ismax; i++) { 2869 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2870 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2871 } 2872 2873 /* Destroy allocated memory */ 2874 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2875 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2876 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2877 2878 for (i=0; i<nrqs; ++i) { 2879 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2880 } 2881 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2882 2883 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2884 PetscFunctionReturn(0); 2885 } 2886 2887 /* 2888 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2889 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2890 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2891 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 
2892 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2893 state, and needs to be "assembled" later by compressing B's column space. 2894 2895 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2896 Following this call, C->A & C->B have been created, even if empty. 2897 */ 2898 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2899 { 2900 /* If making this function public, change the error returned in this function away from _PLIB. */ 2901 PetscErrorCode ierr; 2902 Mat_MPIAIJ *aij; 2903 Mat_SeqAIJ *Baij; 2904 PetscBool seqaij,Bdisassembled; 2905 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2906 PetscScalar v; 2907 const PetscInt *rowindices,*colindices; 2908 2909 PetscFunctionBegin; 2910 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2911 if (A) { 2912 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2913 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2914 if (rowemb) { 2915 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2916 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2917 } else { 2918 if (C->rmap->n != A->rmap->n) { 2919 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2920 } 2921 } 2922 if (dcolemb) { 2923 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2924 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2925 } else { 2926 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2927 } 2928 } 2929 if (B) { 2930 ierr = 
PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2931 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2932 if (rowemb) { 2933 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2934 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2935 } else { 2936 if (C->rmap->n != B->rmap->n) { 2937 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2938 } 2939 } 2940 if (ocolemb) { 2941 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2942 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2943 } else { 2944 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2945 } 2946 } 2947 2948 aij = (Mat_MPIAIJ*)C->data; 2949 if (!aij->A) { 2950 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2951 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2952 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2953 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2954 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2955 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2956 } 2957 if (A) { 2958 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2959 } else { 2960 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2961 } 2962 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2963 /* 2964 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2965 need to "disassemble" B -- convert it to using C's global indices. 2966 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 
2967 2968 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2969 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2970 2971 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2972 At least avoid calling MatSetValues() and the implied searches? 2973 */ 2974 2975 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2976 #if defined(PETSC_USE_CTABLE) 2977 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2978 #else 2979 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2980 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2981 if (aij->B) { 2982 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2983 } 2984 #endif 2985 ngcol = 0; 2986 if (aij->lvec) { 2987 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2988 } 2989 if (aij->garray) { 2990 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2991 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2992 } 2993 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2994 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2995 } 2996 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2997 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2998 } 2999 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3000 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3001 } 3002 } 3003 Bdisassembled = PETSC_FALSE; 3004 if (!aij->B) { 3005 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3006 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3007 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3008 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3009 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3010 Bdisassembled = PETSC_TRUE; 3011 } 3012 if (B) { 3013 Baij = (Mat_SeqAIJ*)B->data; 3014 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3015 
ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3016 for (i=0; i<B->rmap->n; i++) { 3017 nz[i] = Baij->i[i+1] - Baij->i[i]; 3018 } 3019 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3020 ierr = PetscFree(nz);CHKERRQ(ierr); 3021 } 3022 3023 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3024 shift = rend-rstart; 3025 count = 0; 3026 rowindices = NULL; 3027 colindices = NULL; 3028 if (rowemb) { 3029 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3030 } 3031 if (ocolemb) { 3032 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3033 } 3034 for (i=0; i<B->rmap->n; i++) { 3035 PetscInt row; 3036 row = i; 3037 if (rowindices) row = rowindices[i]; 3038 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3039 col = Baij->j[count]; 3040 if (colindices) col = colindices[col]; 3041 if (Bdisassembled && col>=rstart) col += shift; 3042 v = Baij->a[count]; 3043 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3044 ++count; 3045 } 3046 } 3047 /* No assembly for aij->B is necessary. */ 3048 /* FIXME: set aij->B's nonzerostate correctly. */ 3049 } else { 3050 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3051 } 3052 C->preallocated = PETSC_TRUE; 3053 C->was_assembled = PETSC_FALSE; 3054 C->assembled = PETSC_FALSE; 3055 /* 3056 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3057 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3058 */ 3059 PetscFunctionReturn(0); 3060 } 3061 3062 /* 3063 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
3064 */ 3065 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3066 { 3067 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3068 3069 PetscFunctionBegin; 3070 PetscValidPointer(A,2); 3071 PetscValidPointer(B,3); 3072 /* FIXME: make sure C is assembled */ 3073 *A = aij->A; 3074 *B = aij->B; 3075 /* Note that we don't incref *A and *B, so be careful! */ 3076 PetscFunctionReturn(0); 3077 } 3078 3079 /* 3080 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3081 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3082 */ 3083 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3084 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3085 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3086 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3087 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3088 { 3089 PetscErrorCode ierr; 3090 PetscMPIInt size,flag; 3091 PetscInt i,ii,cismax,ispar; 3092 Mat *A,*B; 3093 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3094 3095 PetscFunctionBegin; 3096 if (!ismax) PetscFunctionReturn(0); 3097 3098 for (i = 0, cismax = 0; i < ismax; ++i) { 3099 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr); 3100 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3101 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr); 3102 if (size > 1) ++cismax; 3103 } 3104 3105 /* 3106 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3107 ispar counts the number of parallel ISs across C's comm. 
3108 */ 3109 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr); 3110 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3111 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3112 PetscFunctionReturn(0); 3113 } 3114 3115 /* if (ispar) */ 3116 /* 3117 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3118 These are used to extract the off-diag portion of the resulting parallel matrix. 3119 The row IS for the off-diag portion is the same as for the diag portion, 3120 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3121 */ 3122 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3123 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3124 for (i = 0, ii = 0; i < ismax; ++i) { 3125 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3126 if (size > 1) { 3127 /* 3128 TODO: This is the part that's ***NOT SCALABLE***. 3129 To fix this we need to extract just the indices of C's nonzero columns 3130 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3131 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3132 to be done without serializing on the IS list, so, most likely, it is best 3133 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3134 */ 3135 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3136 /* Now we have to 3137 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3138 were sorted on each rank, concatenated they might no longer be sorted; 3139 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3140 indices in the nondecreasing order to the original index positions. 3141 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 
3142 */ 3143 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3144 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3145 ++ii; 3146 } 3147 } 3148 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3149 for (i = 0, ii = 0; i < ismax; ++i) { 3150 PetscInt j,issize; 3151 const PetscInt *indices; 3152 3153 /* 3154 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3155 */ 3156 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3157 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3158 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3159 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3160 for (j = 1; j < issize; ++j) { 3161 if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3162 } 3163 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3164 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3165 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3166 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3167 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3168 for (j = 1; j < issize; ++j) { 3169 if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3170 } 3171 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3172 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3173 if (size > 1) { 3174 cisrow[ii] = isrow[i]; 3175 ++ii; 3176 } 3177 } 3178 /* 3179 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3180 array of sequential matrices underlying the resulting parallel matrices. 3181 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3182 contain duplicates. 
3183 3184 There are as many diag matrices as there are original index sets. There are only as many parallel 3185 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3186 3187 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3188 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3189 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3190 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3191 with A[i] and B[ii] extracted from the corresponding MPI submat. 3192 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3193 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3194 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3195 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3196 values into A[i] and B[ii] sitting inside the corresponding submat. 3197 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3198 A[i], B[ii], AA[i] or BB[ii] matrices. 3199 */ 3200 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3201 if (scall == MAT_INITIAL_MATRIX) { 3202 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3203 } 3204 3205 /* Now obtain the sequential A and B submatrices separately. 
*/ 3206 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3207 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3208 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3209 3210 /* 3211 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3212 matrices A & B have been extracted directly into the parallel matrices containing them, or 3213 simply into the sequential matrix identical with the corresponding A (if size == 1). 3214 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3215 to have the same sparsity pattern. 3216 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3217 must be constructed for C. This is done by setseqmat(s). 3218 */ 3219 for (i = 0, ii = 0; i < ismax; ++i) { 3220 /* 3221 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3222 That way we can avoid sorting and computing permutations when reusing. 3223 To this end: 3224 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3225 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3226 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3227 */ 3228 MatStructure pattern = DIFFERENT_NONZERO_PATTERN; 3229 3230 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3231 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3232 if (size > 1) { 3233 if (scall == MAT_INITIAL_MATRIX) { 3234 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3235 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3236 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3237 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3238 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3239 } 3240 /* 3241 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3242 */ 3243 { 3244 Mat AA = A[i],BB = B[ii]; 3245 3246 if (AA || BB) { 3247 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3248 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3249 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3250 } 3251 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3252 } 3253 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3254 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3255 ++ii; 3256 } else { /* if (size == 1) */ 3257 if (scall == MAT_REUSE_MATRIX) { 3258 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3259 } 3260 if (isrow_p[i] || iscol_p[i]) { 3261 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3262 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3263 /* Otherwise A is extracted straight into (*submats)[i]. */ 3264 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3265 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3266 } else (*submat)[i] = A[i]; 3267 } 3268 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3269 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3270 } 3271 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3272 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3273 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3274 ierr = PetscFree(A);CHKERRQ(ierr); 3275 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3276 PetscFunctionReturn(0); 3277 } 3278 3279 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3280 { 3281 PetscErrorCode ierr; 3282 3283 PetscFunctionBegin; 3284 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3285 PetscFunctionReturn(0); 3286 } 3287