/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Increase the overlap of the imax index sets is[]
   by ov levels; each level is computed by one call to
   MatIncreaseOverlap_MPIAIJ_Once. Errors if ov is negative.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but each level is computed with the memory-scalable variant
   MatIncreaseOverlap_MPIAIJ_Once_Scalable (PetscSF-based communication).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Add one level of overlap to the
   index sets is[] using a memory-scalable scheme:
     1) gather all IS indices and classify each row by owning rank (row layout);
     2) build a two-sided communication pattern (PetscCommBuildTwoSided) and
        ship <is id, row> pairs to the owners via a PetscSF broadcast;
     3) owners compute the overlap rows (MatIncreaseOverlap_MPIAIJ_Send_Scalable)
        and ship them back through a second SF, overlapping that communication
        with the purely local overlap computation;
     4) merge received rows into the IS's (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr     = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr     = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr   = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr   = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communcation */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr        = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr      = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr      = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr      = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr      = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap rows received from
   remote ranks into the index sets. recvdata is a stream of
   <is id, count, count rows...> triples of total length nrecvs; each IS is
   rebuilt with duplicates removed (PetscSortRemoveDupsInt).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ?
lsize:max_lsize;
    isz[i] = lsize;
  }
  /* each IS gets a slot of max_lsize+nrecvs entries: enough for its current
     indices plus (worst case) every received row */
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For every <is id, row> pair received
   from the nfrom requesting ranks, collect the column indices of that locally
   owned row (diagonal block Amat plus off-diagonal block Bmat mapped through
   gcols), deduplicate them, and pack reply messages as
   <is id, count, count rows...> triples. On output *sbrowsizes holds per-rank
   <size, offset> pairs and *sbrows the packed data; the caller frees both.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count reply sizes per <rank, is> so replies can be packed
     without reallocation */
  for (i=0; i<nfrom; i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];/* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations  */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i=0;i<nfrom;i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j];
l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* record per-rank send offsets in the odd entries of sbsizes */
  ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Purely local part of the scalable
   overlap: for each locally owned row currently in an IS, add the column
   indices of that row (Amat shifted by cstart, Bmat mapped through gcols) and
   recreate the IS without duplicates. Rows owned by other ranks are skipped.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp;
  PetscMPIInt    rank,owner;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
/*
   MatIncreaseOverlap_MPIAIJ_Once - Add one level of overlap using point-to-point
   MPI messages: rows are sent to their owning ranks, owners compute the overlap
   (MatIncreaseOverlap_MPIAIJ_Receive) while the local contribution is computed
   concurrently (MatIncreaseOverlap_MPIAIJ_Local), and the results are merged
   into new IS's. With PETSC_USE_CTABLE the per-IS result is kept in a hash
   table instead of a dense array of the global row dimension.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into per-destination slices */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i],M,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else
if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
  }

  /* No longer need the original indices */
  ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the sends are in flight */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr,&recv_status);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);
  ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size the scratch array from the largest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* build the enlarged index sets */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k; /* table stores key+1 -> position+1 */
    }
    ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(iscomms);CHKERRQ(ierr);
  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  if (xdata) {
    ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
    ierr = PetscFree(xdata);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr =
PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C     - MAT_MPIAIJ;
      imax  - total no of index sets processed at a time;
      table - an array of char - size = m bits.

     Output:
      isz   - array containing the count of the solution elements corresponding
              to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr =
PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 849 while (tpos) { 850 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 851 tdata[--j] = --row; 852 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount); 853 } 854 #else 855 data_i = data[i]; 856 #endif 857 table_i = table[i]; 858 isz_i = isz[i]; 859 max = isz[i]; 860 861 for (j=0; j<max; j++) { 862 #if defined(PETSC_USE_CTABLE) 863 row = tdata[j] - rstart; 864 #else 865 row = data_i[j] - rstart; 866 #endif 867 start = ai[row]; 868 end = ai[row+1]; 869 for (k=start; k<end; k++) { /* Amat */ 870 val = aj[k] + cstart; 871 if (!PetscBTLookupSet(table_i,val)) { 872 #if defined(PETSC_USE_CTABLE) 873 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 874 #else 875 data_i[isz_i] = val; 876 #endif 877 isz_i++; 878 } 879 } 880 start = bi[row]; 881 end = bi[row+1]; 882 for (k=start; k<end; k++) { /* Bmat */ 883 val = garray[bj[k]]; 884 if (!PetscBTLookupSet(table_i,val)) { 885 #if defined(PETSC_USE_CTABLE) 886 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 887 #else 888 data_i[isz_i] = val; 889 #endif 890 isz_i++; 891 } 892 } 893 } 894 isz[i] = isz_i; 895 896 #if defined(PETSC_USE_CTABLE) 897 ierr = PetscFree(tdata);CHKERRQ(ierr); 898 #endif 899 } 900 PetscFunctionReturn(0); 901 } 902 903 /* 904 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 905 and return the output 906 907 Input: 908 C - the matrix 909 nrqr - no of messages being processed. 910 rbuf - an array of pointers to the received requests 911 912 Output: 913 xdata - array of messages to be sent back 914 isz1 - size of each message 915 916 For better efficiency perhaps we should malloc separately each xdata[i], 917 then if a remalloc is required we need only copy the data for that one row 918 rather then all previous rows as it is now where a single large chunck of 919 memory is used. 

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;                                  /* diagonal / off-diagonal blocks */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;        /* global row count; sizes the dedup bitmap */
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* Message layout in rbuf[i]: [0]=number of ISs, then for each IS j a header
     pair at [2*j-1],[2*j] (id, row count) followed by the requested rows.
     First pass: count total requested rows (ct) and row data volume (total_sz). */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Heuristic initial size for the single shared reply buffer: average nonzeros
     per local row times the number of requested rows, vs. the raw request size */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  if (nrqr) {
    /* all xdata[i] alias into this one allocation (see re-pointering below) */
    ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
    ++no_malloc;
  }
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;   /* total entries written across all replies; compared with mem_estimate */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor: first row index after the headers */
    ct2    =  ct1;          /* write cursor into xdata[i] (headers reserved) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);   /* fresh dedup set per IS */
      oct2 = ct2;              /* remember where this IS's data starts */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and re-point every xdata[ctr] */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* Now expand each accepted row by its column adjacency (one level of overlap) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {        /* diagonal-block columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {        /* off-diagonal columns via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2;   /* next reply starts right after this one */
    isz1[i] = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix: gather the parallel MPIAIJ matrix A
    into one sequential matrix B replicated on each rank (structure always;
    numerical values only when flag == MAT_GET_VALUES).
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the
number of nonzeros per row
    */
    /* each rank fills in its own slice of lens[]; Allgatherv completes it */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);   /* Bin is a 2-slot array; only slot 0 is set here */
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, directly into B's j array
       at the offset owned by this rank, merging A (diagonal) and B
       (off-diagonal) parts back into globally-sorted column order
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {   /* MAT_REUSE_MATRIX: structure already built on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column-index pass above,
       so values land next to their matching indices */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path did not allocate these above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the replicated copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Extract a single submatrix (one IS per rank) from a parallel MPIAIJ matrix;
   continues past this point — see remainder of the function below */
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ*)C->data;
  Mat             submat,A = c->A,B = c->B;
  Mat_SeqAIJ      *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt        *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt        cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt  *icol,*irow;
  PetscInt        nrow,ncol,start;
  PetscErrorCode  ierr;
  PetscMPIInt     rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt        **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt        nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt        **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt        *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt    ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt    *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
  MPI_Request
*s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1255 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1256 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1257 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1258 MPI_Comm comm; 1259 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1260 PetscMPIInt *onodes1,*olengths1,idex,end; 1261 Mat_SubSppt *smatis1; 1262 PetscBool isrowsorted,iscolsorted; 1263 1264 PetscFunctionBegin; 1265 PetscValidLogicalCollectiveInt(C,ismax,2); 1266 PetscValidLogicalCollectiveEnum(C,scall,5); 1267 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1268 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1269 ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1270 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1271 size = c->size; 1272 rank = c->rank; 1273 1274 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1275 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1276 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1277 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1278 if (allcolumns) { 1279 icol = NULL; 1280 ncol = C->cmap->N; 1281 } else { 1282 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1283 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1284 } 1285 1286 if (scall == MAT_INITIAL_MATRIX) { 1287 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1288 1289 /* Get some new tags to keep the communication clean */ 1290 tag1 = ((PetscObject)C)->tag; 1291 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1292 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1293 1294 /* evaluate communication - mesg to who, length of mesg, and buffer space 1295 required. 
Based on this, buffers are allocated, and data copied into them */ 1296 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1297 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1298 1299 /* w1[proc] = num of rows owned by proc -- to be requested */ 1300 proc = 0; 1301 nrqs = 0; /* num of outgoing messages */ 1302 for (j=0; j<nrow; j++) { 1303 row = irow[j]; 1304 if (!isrowsorted) proc = 0; 1305 while (row >= C->rmap->range[proc+1]) proc++; 1306 w1[proc]++; 1307 row2proc[j] = proc; /* map row index to proc */ 1308 1309 if (proc != rank && !w2[proc]) { 1310 w2[proc] = 1; nrqs++; 1311 } 1312 } 1313 w1[rank] = 0; /* rows owned by self will not be requested */ 1314 1315 ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1316 for (proc=0,j=0; proc<size; proc++) { 1317 if (w1[proc]) { pa[j++] = proc;} 1318 } 1319 1320 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1321 msz = 0; /* total mesg length (for all procs) */ 1322 for (i=0; i<nrqs; i++) { 1323 proc = pa[i]; 1324 w1[proc] += 3; 1325 msz += w1[proc]; 1326 } 1327 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1328 1329 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1330 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1331 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1332 1333 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1334 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1335 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1336 1337 /* Now post the Irecvs corresponding to these messages */ 1338 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1339 1340 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1341 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1342 1343 /* Allocate Memory for outgoing messages */ 
1344 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1345 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1346 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1347 1348 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1349 iptr = tmp; 1350 for (i=0; i<nrqs; i++) { 1351 proc = pa[i]; 1352 sbuf1[proc] = iptr; 1353 iptr += w1[proc]; 1354 } 1355 1356 /* Form the outgoing messages */ 1357 /* Initialize the header space */ 1358 for (i=0; i<nrqs; i++) { 1359 proc = pa[i]; 1360 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1361 ptr[proc] = sbuf1[proc] + 3; 1362 } 1363 1364 /* Parse the isrow and copy data into outbuf */ 1365 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1366 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1367 proc = row2proc[j]; 1368 if (proc != rank) { /* copy to the outgoing buf*/ 1369 *ptr[proc] = irow[j]; 1370 ctr[proc]++; ptr[proc]++; 1371 } 1372 } 1373 1374 /* Update the headers for the current IS */ 1375 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1376 if ((ctr_j = ctr[j])) { 1377 sbuf1_j = sbuf1[j]; 1378 k = ++sbuf1_j[0]; 1379 sbuf1_j[2*k] = ctr_j; 1380 sbuf1_j[2*k-1] = 0; 1381 } 1382 } 1383 1384 /* Now post the sends */ 1385 ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr); 1386 for (i=0; i<nrqs; ++i) { 1387 proc = pa[i]; 1388 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 1389 } 1390 1391 /* Post Receives to capture the buffer size */ 1392 ierr = PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);CHKERRQ(ierr); 1393 ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr); 1394 1395 if (nrqs) rbuf2[0] = tmp + msz; 1396 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1397 1398 for (i=0; i<nrqs; ++i) { 1399 proc = pa[i]; 1400 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 1401 } 1402 1403 ierr = 
PetscFree2(w1,w2);CHKERRQ(ierr); 1404 1405 /* Send to other procs the buf size they should allocate */ 1406 /* Receive messages*/ 1407 ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr); 1408 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1409 1410 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 1411 for (i=0; i<nrqr; ++i) { 1412 req_size[i] = 0; 1413 rbuf1_i = rbuf1[i]; 1414 start = 2*rbuf1_i[0] + 1; 1415 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 1416 ierr = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr); 1417 sbuf2_i = sbuf2[i]; 1418 for (j=start; j<end; j++) { 1419 k = rbuf1_i[j] - rstart; 1420 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1421 sbuf2_i[j] = ncols; 1422 req_size[i] += ncols; 1423 } 1424 req_source1[i] = r_status1[i].MPI_SOURCE; 1425 1426 /* form the header */ 1427 sbuf2_i[0] = req_size[i]; 1428 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1429 1430 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 1431 } 1432 1433 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1434 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1435 1436 /* rbuf2 is received, Post recv column indices a->j */ 1437 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 1438 1439 ierr = PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);CHKERRQ(ierr); 1440 for (i=0; i<nrqs; ++i) { 1441 ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr); 1442 req_source2[i] = r_status2[i].MPI_SOURCE; 1443 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 1444 } 1445 1446 /* Wait on sends1 and sends2 */ 1447 ierr = PetscMalloc1(nrqs,&s_status1);CHKERRQ(ierr); 1448 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr); 1449 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1450 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1451 1452 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr); 1453 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1454 1455 /* Now allocate sending buffers for a->j, and send them off */ 1456 ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr); 1457 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1458 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);} 1459 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1460 1461 for (i=0; i<nrqr; i++) { /* for each requested message */ 1462 rbuf1_i = rbuf1[i]; 1463 sbuf_aj_i = sbuf_aj[i]; 1464 ct1 = 2*rbuf1_i[0] + 1; 1465 ct2 = 0; 1466 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */ 1467 1468 kmax = rbuf1[i][2]; 1469 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1470 row = rbuf1_i[ct1] - rstart; 1471 nzA = ai[row+1] - ai[row]; 1472 nzB = bi[row+1] - bi[row]; 1473 ncols = nzA + nzB; 1474 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1475 1476 /* load the column indices for this row into cols*/ 1477 cols = sbuf_aj_i + ct2; 1478 1479 lwrite = 0; 1480 for (l=0; l<nzB; l++) { 1481 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1482 } 1483 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1484 for (l=0; l<nzB; l++) { 1485 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1486 } 1487 1488 ct2 += ncols; 1489 } 1490 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 1491 } 1492 1493 /* create column map (cmap): global col of C -> local col of submat */ 1494 #if defined(PETSC_USE_CTABLE) 1495 if (!allcolumns) { 1496 ierr = PetscTableCreate(ncol,C->cmap->N,&cmap);CHKERRQ(ierr); 1497 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1498 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1499 if (icol[j] >= cstart && icol[j] <cend) { 1500 cmap_loc[icol[j] - cstart] = j+1; 1501 } else { /* use PetscTable for non-local col indices */ 1502 ierr = 
PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1503 } 1504 } 1505 } else { 1506 cmap = NULL; 1507 cmap_loc = NULL; 1508 } 1509 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1510 #else 1511 if (!allcolumns) { 1512 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1513 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1514 } else { 1515 cmap = NULL; 1516 } 1517 #endif 1518 1519 /* Create lens for MatSeqAIJSetPreallocation() */ 1520 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1521 1522 /* Compute lens from local part of C */ 1523 for (j=0; j<nrow; j++) { 1524 row = irow[j]; 1525 proc = row2proc[j]; 1526 if (proc == rank) { 1527 /* diagonal part A = c->A */ 1528 ncols = ai[row-rstart+1] - ai[row-rstart]; 1529 cols = aj + ai[row-rstart]; 1530 if (!allcolumns) { 1531 for (k=0; k<ncols; k++) { 1532 #if defined(PETSC_USE_CTABLE) 1533 tcol = cmap_loc[cols[k]]; 1534 #else 1535 tcol = cmap[cols[k]+cstart]; 1536 #endif 1537 if (tcol) lens[j]++; 1538 } 1539 } else { /* allcolumns */ 1540 lens[j] = ncols; 1541 } 1542 1543 /* off-diagonal part B = c->B */ 1544 ncols = bi[row-rstart+1] - bi[row-rstart]; 1545 cols = bj + bi[row-rstart]; 1546 if (!allcolumns) { 1547 for (k=0; k<ncols; k++) { 1548 #if defined(PETSC_USE_CTABLE) 1549 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1550 #else 1551 tcol = cmap[bmap[cols[k]]]; 1552 #endif 1553 if (tcol) lens[j]++; 1554 } 1555 } else { /* allcolumns */ 1556 lens[j] += ncols; 1557 } 1558 } 1559 } 1560 1561 /* Create row map (rmap): global row of C -> local row of submat */ 1562 #if defined(PETSC_USE_CTABLE) 1563 ierr = PetscTableCreate(nrow,C->rmap->N,&rmap);CHKERRQ(ierr); 1564 for (j=0; j<nrow; j++) { 1565 row = irow[j]; 1566 proc = row2proc[j]; 1567 if (proc == rank) { /* a local row */ 1568 rmap_loc[row - rstart] = j; 1569 } else { 1570 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1571 } 1572 } 1573 #else 1574 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1575 
for (j=0; j<nrow; j++) { 1576 rmap[irow[j]] = j; 1577 } 1578 #endif 1579 1580 /* Update lens from offproc data */ 1581 /* recv a->j is done */ 1582 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 1583 for (i=0; i<nrqs; i++) { 1584 proc = pa[i]; 1585 sbuf1_i = sbuf1[proc]; 1586 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1587 ct1 = 2 + 1; 1588 ct2 = 0; 1589 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1590 rbuf3_i = rbuf3[i]; /* received C->j */ 1591 1592 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1593 max1 = sbuf1_i[2]; 1594 for (k=0; k<max1; k++,ct1++) { 1595 #if defined(PETSC_USE_CTABLE) 1596 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1597 row--; 1598 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1599 #else 1600 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1601 #endif 1602 /* Now, store row index of submat in sbuf1_i[ct1] */ 1603 sbuf1_i[ct1] = row; 1604 1605 nnz = rbuf2_i[ct1]; 1606 if (!allcolumns) { 1607 for (l=0; l<nnz; l++,ct2++) { 1608 #if defined(PETSC_USE_CTABLE) 1609 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1610 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1611 } else { 1612 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1613 } 1614 #else 1615 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1616 #endif 1617 if (tcol) lens[row]++; 1618 } 1619 } else { /* allcolumns */ 1620 lens[row] += nnz; 1621 } 1622 } 1623 } 1624 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr); 1625 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1626 1627 /* Create the submatrices */ 1628 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1629 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1630 1631 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1632 ierr = 
ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1633 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1634 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1635 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1636 1637 /* create struct Mat_SubSppt and attached it to submat */ 1638 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1639 subc = (Mat_SeqAIJ*)submat->data; 1640 subc->submatis1 = smatis1; 1641 1642 smatis1->id = 0; 1643 smatis1->nrqs = nrqs; 1644 smatis1->nrqr = nrqr; 1645 smatis1->rbuf1 = rbuf1; 1646 smatis1->rbuf2 = rbuf2; 1647 smatis1->rbuf3 = rbuf3; 1648 smatis1->sbuf2 = sbuf2; 1649 smatis1->req_source2 = req_source2; 1650 1651 smatis1->sbuf1 = sbuf1; 1652 smatis1->ptr = ptr; 1653 smatis1->tmp = tmp; 1654 smatis1->ctr = ctr; 1655 1656 smatis1->pa = pa; 1657 smatis1->req_size = req_size; 1658 smatis1->req_source1 = req_source1; 1659 1660 smatis1->allcolumns = allcolumns; 1661 smatis1->singleis = PETSC_TRUE; 1662 smatis1->row2proc = row2proc; 1663 smatis1->rmap = rmap; 1664 smatis1->cmap = cmap; 1665 #if defined(PETSC_USE_CTABLE) 1666 smatis1->rmap_loc = rmap_loc; 1667 smatis1->cmap_loc = cmap_loc; 1668 #endif 1669 1670 smatis1->destroy = submat->ops->destroy; 1671 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1672 submat->factortype = C->factortype; 1673 1674 /* compute rmax */ 1675 rmax = 0; 1676 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1677 1678 } else { /* scall == MAT_REUSE_MATRIX */ 1679 submat = submats[0]; 1680 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1681 1682 subc = (Mat_SeqAIJ*)submat->data; 1683 rmax = subc->rmax; 1684 smatis1 = subc->submatis1; 1685 nrqs = smatis1->nrqs; 1686 nrqr = smatis1->nrqr; 1687 rbuf1 = smatis1->rbuf1; 1688 rbuf2 = smatis1->rbuf2; 1689 rbuf3 = smatis1->rbuf3; 1690 req_source2 = smatis1->req_source2; 1691 1692 sbuf1 = smatis1->sbuf1; 1693 sbuf2 = smatis1->sbuf2; 1694 ptr = smatis1->ptr; 1695 tmp = smatis1->tmp; 1696 ctr = smatis1->ctr; 1697 1698 pa = smatis1->pa; 1699 req_size = smatis1->req_size; 1700 req_source1 = smatis1->req_source1; 1701 1702 allcolumns = smatis1->allcolumns; 1703 row2proc = smatis1->row2proc; 1704 rmap = smatis1->rmap; 1705 cmap = smatis1->cmap; 1706 #if defined(PETSC_USE_CTABLE) 1707 rmap_loc = smatis1->rmap_loc; 1708 cmap_loc = smatis1->cmap_loc; 1709 #endif 1710 } 1711 1712 /* Post recv matrix values */ 1713 ierr = PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1714 ierr = PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);CHKERRQ(ierr); 1715 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1716 for (i=0; i<nrqs; ++i) { 1717 ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr); 1718 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 1719 } 1720 1721 /* Allocate sending buffers for a->a, and send them off */ 1722 ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr); 1723 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1724 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);} 1725 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1726 1727 for (i=0; i<nrqr; i++) { 1728 rbuf1_i = rbuf1[i]; 1729 sbuf_aa_i = sbuf_aa[i]; 1730 ct1 = 2*rbuf1_i[0]+1; 1731 ct2 = 0; 1732 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1733 1734 kmax = rbuf1_i[2]; 1735 for (k=0; k<kmax; k++,ct1++) { 1736 row = rbuf1_i[ct1] - rstart; 1737 nzA = ai[row+1] - ai[row]; 1738 nzB = bi[row+1] - bi[row]; 
1739 ncols = nzA + nzB; 1740 cworkB = bj + bi[row]; 1741 vworkA = a_a + ai[row]; 1742 vworkB = b_a + bi[row]; 1743 1744 /* load the column values for this row into vals*/ 1745 vals = sbuf_aa_i + ct2; 1746 1747 lwrite = 0; 1748 for (l=0; l<nzB; l++) { 1749 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1750 } 1751 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1752 for (l=0; l<nzB; l++) { 1753 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1754 } 1755 1756 ct2 += ncols; 1757 } 1758 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 1759 } 1760 1761 /* Assemble submat */ 1762 /* First assemble the local rows */ 1763 for (j=0; j<nrow; j++) { 1764 row = irow[j]; 1765 proc = row2proc[j]; 1766 if (proc == rank) { 1767 Crow = row - rstart; /* local row index of C */ 1768 #if defined(PETSC_USE_CTABLE) 1769 row = rmap_loc[Crow]; /* row index of submat */ 1770 #else 1771 row = rmap[row]; 1772 #endif 1773 1774 if (allcolumns) { 1775 /* diagonal part A = c->A */ 1776 ncols = ai[Crow+1] - ai[Crow]; 1777 cols = aj + ai[Crow]; 1778 vals = a_a + ai[Crow]; 1779 i = 0; 1780 for (k=0; k<ncols; k++) { 1781 subcols[i] = cols[k] + cstart; 1782 subvals[i++] = vals[k]; 1783 } 1784 1785 /* off-diagonal part B = c->B */ 1786 ncols = bi[Crow+1] - bi[Crow]; 1787 cols = bj + bi[Crow]; 1788 vals = b_a + bi[Crow]; 1789 for (k=0; k<ncols; k++) { 1790 subcols[i] = bmap[cols[k]]; 1791 subvals[i++] = vals[k]; 1792 } 1793 1794 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1795 1796 } else { /* !allcolumns */ 1797 #if defined(PETSC_USE_CTABLE) 1798 /* diagonal part A = c->A */ 1799 ncols = ai[Crow+1] - ai[Crow]; 1800 cols = aj + ai[Crow]; 1801 vals = a_a + ai[Crow]; 1802 i = 0; 1803 for (k=0; k<ncols; k++) { 1804 tcol = cmap_loc[cols[k]]; 1805 if (tcol) { 1806 subcols[i] = --tcol; 1807 subvals[i++] = vals[k]; 1808 } 1809 } 1810 1811 /* off-diagonal part B = c->B */ 1812 ncols = 
bi[Crow+1] - bi[Crow]; 1813 cols = bj + bi[Crow]; 1814 vals = b_a + bi[Crow]; 1815 for (k=0; k<ncols; k++) { 1816 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1817 if (tcol) { 1818 subcols[i] = --tcol; 1819 subvals[i++] = vals[k]; 1820 } 1821 } 1822 #else 1823 /* diagonal part A = c->A */ 1824 ncols = ai[Crow+1] - ai[Crow]; 1825 cols = aj + ai[Crow]; 1826 vals = a_a + ai[Crow]; 1827 i = 0; 1828 for (k=0; k<ncols; k++) { 1829 tcol = cmap[cols[k]+cstart]; 1830 if (tcol) { 1831 subcols[i] = --tcol; 1832 subvals[i++] = vals[k]; 1833 } 1834 } 1835 1836 /* off-diagonal part B = c->B */ 1837 ncols = bi[Crow+1] - bi[Crow]; 1838 cols = bj + bi[Crow]; 1839 vals = b_a + bi[Crow]; 1840 for (k=0; k<ncols; k++) { 1841 tcol = cmap[bmap[cols[k]]]; 1842 if (tcol) { 1843 subcols[i] = --tcol; 1844 subvals[i++] = vals[k]; 1845 } 1846 } 1847 #endif 1848 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1849 } 1850 } 1851 } 1852 1853 /* Now assemble the off-proc rows */ 1854 for (i=0; i<nrqs; i++) { /* for each requested message */ 1855 /* recv values from other processes */ 1856 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr); 1857 proc = pa[idex]; 1858 sbuf1_i = sbuf1[proc]; 1859 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1860 ct1 = 2 + 1; 1861 ct2 = 0; /* count of received C->j */ 1862 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1863 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1864 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1865 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1866 1867 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1868 max1 = sbuf1_i[2]; /* num of rows */ 1869 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1870 row = sbuf1_i[ct1]; /* row index of submat */ 
1871 if (!allcolumns) { 1872 idex = 0; 1873 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1874 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1875 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1876 #if defined(PETSC_USE_CTABLE) 1877 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1878 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1879 } else { 1880 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1881 } 1882 #else 1883 tcol = cmap[rbuf3_i[ct2]]; 1884 #endif 1885 if (tcol) { 1886 subcols[idex] = --tcol; /* may not be sorted */ 1887 subvals[idex++] = rbuf4_i[ct2]; 1888 1889 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1890 For reuse, we replace received C->j with index that should be inserted to submat */ 1891 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1892 } 1893 } 1894 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1895 } else { /* scall == MAT_REUSE_MATRIX */ 1896 submat = submats[0]; 1897 subc = (Mat_SeqAIJ*)submat->data; 1898 1899 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1900 for (l=0; l<nnz; l++) { 1901 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1902 subvals[idex++] = rbuf4_i[ct2]; 1903 } 1904 1905 bj = subc->j + subc->i[row]; /* sorted column indices */ 1906 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1907 } 1908 } else { /* allcolumns */ 1909 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1910 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1911 ct2 += nnz; 1912 } 1913 } 1914 } 1915 1916 /* sending a->a are done */ 1917 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr); 1918 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1919 1920 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1921 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  submats[0] = submat;

  /* Restore the row IS and, unless all columns were requested, the column IS */
  ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
  if (!allcolumns) {
    ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
  }

  /* Destroy allocated memory: per-request receive buffers first, then the holder arrays */
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
  if (sbuf_aa) {
    /* sbuf_aa[0] is one contiguous slab holding the values for all requests; the
       remaining sbuf_aa[i] are offsets into it, so only [0] and the array are freed */
    ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    /* lens and sbuf_aj are freed only on the initial call; on MAT_REUSE_MATRIX these
       names were taken from submatis1 (see top of function) and must survive */
    ierr = PetscFree(lens);CHKERRQ(ierr);
    if (sbuf_aj) {
      ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); /* same contiguous-slab layout as sbuf_aa */
      ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
    }
  }
  ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
  ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCreateSubMatrices_MPIAIJ_SingleIS - Extracts one sequential submatrix per process
   from the parallel matrix C, where each process supplies exactly one row IS and one
   column IS (isrow[0]/iscol[0]).

   On MAT_INITIAL_MATRIX a 2-entry result array is allocated.
   NOTE(review): the second slot appears reserved for support/dummy data used on reuse --
   confirm against MatCreateSubMatrices_MPIAIJ_SingleIS_Local.

   The allcolumns fast path is taken when iscol[0] is the identity over all C->cmap->N
   global columns; downstream code can then skip column-index translation.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       ncol;
  PetscBool      colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
  ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatCreateSubMatrices_MPIAIJ - Extracts ismax sequential submatrices from the parallel
   MPIAIJ matrix C, one per (isrow[i], iscol[i]) pair.

   Dispatch order:
     1. If C->submat_singleis was set (by PCSetUp_ASM(), to skip the MPI_Allreduce below),
        delegate to MatCreateSubMatrices_MPIAIJ_SingleIS and reset the flag.
     2. If every process asks for the identity over the whole (square) matrix,
        "wantallmatrix" routes to the MatCreateSubMatrix_MPIAIJ_All fast path
        (can be forced on/off with -use_fast_submatrix).
     3. Otherwise the extraction runs in nstages stages of at most nmax submatrices,
        where nmax caps the per-stage workspace at roughly 20M PetscInts of column map.

   Collective; on MAT_INITIAL_MATRIX the result array *submat is allocated here with
   ismax+nstages entries (the trailing nstages slots hold dummy matrices that carry
   the reuse support struct when a stage has no local submatrix).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extractable per stage within ~20M PetscInts of column map */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* MPI_MAX over the negated flag acts as a logical AND: out[0] stays -1 only if
       every rank set wantallmatrix; any rank with 0 vetoes the fast path */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* replay the stage count saved on the initial call */
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  for (i=0,pos=0; i<nstages; i++) {
    /* max_no = how many of the remaining ISes this stage processes (0 once pos >= ismax,
       in which case a dummy submatrix is still produced to carry the support struct) */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0;
    else max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/ 2072 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2073 { 2074 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2075 Mat A = c->A; 2076 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2077 const PetscInt **icol,**irow; 2078 PetscInt *nrow,*ncol,start; 2079 PetscErrorCode ierr; 2080 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2081 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2082 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2083 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2084 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2085 #if defined(PETSC_USE_CTABLE) 2086 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2087 #else 2088 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2089 #endif 2090 const PetscInt *irow_i; 2091 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2092 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2093 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2094 MPI_Status *r_status1,*r_status2; 2095 MPI_Comm comm; 2096 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2097 PetscMPIInt *onodes1,*olengths1,end; 2098 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2099 Mat_SubSppt *smat_i; 2100 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2101 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2102 2103 PetscFunctionBegin; 2104 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2105 size = c->size; 2106 rank = c->rank; 2107 2108 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2109 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2110 2111 for (i=0; i<ismax; 
i++) { 2112 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2113 if (!issorted[i]) iscsorted = issorted[i]; 2114 2115 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2116 2117 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2118 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2119 2120 /* Check for special case: allcolumn */ 2121 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2122 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2123 if (colflag && ncol[i] == C->cmap->N) { 2124 allcolumns[i] = PETSC_TRUE; 2125 icol[i] = NULL; 2126 } else { 2127 allcolumns[i] = PETSC_FALSE; 2128 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2129 } 2130 } 2131 2132 if (scall == MAT_REUSE_MATRIX) { 2133 /* Assumes new rows are same length as the old rows */ 2134 for (i=0; i<ismax; i++) { 2135 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2136 subc = (Mat_SeqAIJ*)submats[i]->data; 2137 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 2138 2139 /* Initial matrix as if empty */ 2140 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2141 2142 smat_i = subc->submatis1; 2143 2144 nrqs = smat_i->nrqs; 2145 nrqr = smat_i->nrqr; 2146 rbuf1 = smat_i->rbuf1; 2147 rbuf2 = smat_i->rbuf2; 2148 rbuf3 = smat_i->rbuf3; 2149 req_source2 = smat_i->req_source2; 2150 2151 sbuf1 = smat_i->sbuf1; 2152 sbuf2 = smat_i->sbuf2; 2153 ptr = smat_i->ptr; 2154 tmp = smat_i->tmp; 2155 ctr = smat_i->ctr; 2156 2157 pa = smat_i->pa; 2158 req_size = smat_i->req_size; 2159 req_source1 = smat_i->req_source1; 2160 2161 allcolumns[i] = smat_i->allcolumns; 2162 row2proc[i] = smat_i->row2proc; 2163 rmap[i] = smat_i->rmap; 2164 cmap[i] = smat_i->cmap; 2165 } 2166 2167 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2168 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2169 smat_i = (Mat_SubSppt*)submats[0]->data; 2170 2171 nrqs = smat_i->nrqs; 2172 nrqr = smat_i->nrqr; 2173 rbuf1 = smat_i->rbuf1; 2174 rbuf2 = smat_i->rbuf2; 2175 rbuf3 = smat_i->rbuf3; 2176 req_source2 = smat_i->req_source2; 2177 2178 sbuf1 = smat_i->sbuf1; 2179 sbuf2 = smat_i->sbuf2; 2180 ptr = smat_i->ptr; 2181 tmp = smat_i->tmp; 2182 ctr = smat_i->ctr; 2183 2184 pa = smat_i->pa; 2185 req_size = smat_i->req_size; 2186 req_source1 = smat_i->req_source1; 2187 2188 allcolumns[0] = PETSC_FALSE; 2189 } 2190 } else { /* scall == MAT_INITIAL_MATRIX */ 2191 /* Get some new tags to keep the communication clean */ 2192 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2193 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2194 2195 /* evaluate communication - mesg to who, length of mesg, and buffer space 2196 required. 
Based on this, buffers are allocated, and data copied into them*/ 2197 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2198 2199 for (i=0; i<ismax; i++) { 2200 jmax = nrow[i]; 2201 irow_i = irow[i]; 2202 2203 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2204 row2proc[i] = row2proc_i; 2205 2206 if (issorted[i]) proc = 0; 2207 for (j=0; j<jmax; j++) { 2208 if (!issorted[i]) proc = 0; 2209 row = irow_i[j]; 2210 while (row >= C->rmap->range[proc+1]) proc++; 2211 w4[proc]++; 2212 row2proc_i[j] = proc; /* map row index to proc */ 2213 } 2214 for (j=0; j<size; j++) { 2215 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2216 } 2217 } 2218 2219 nrqs = 0; /* no of outgoing messages */ 2220 msz = 0; /* total mesg length (for all procs) */ 2221 w1[rank] = 0; /* no mesg sent to self */ 2222 w3[rank] = 0; 2223 for (i=0; i<size; i++) { 2224 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2225 } 2226 ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2227 for (i=0,j=0; i<size; i++) { 2228 if (w1[i]) { pa[j] = i; j++; } 2229 } 2230 2231 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2232 for (i=0; i<nrqs; i++) { 2233 j = pa[i]; 2234 w1[j] += w2[j] + 2* w3[j]; 2235 msz += w1[j]; 2236 } 2237 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2238 2239 /* Determine the number of messages to expect, their lengths, from from-ids */ 2240 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2241 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2242 2243 /* Now post the Irecvs corresponding to these messages */ 2244 tag0 = ((PetscObject)C)->tag; 2245 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2246 2247 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2248 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2249 2250 /* Allocate 
Memory for outgoing messages */ 2251 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2252 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2253 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2254 2255 { 2256 PetscInt *iptr = tmp; 2257 k = 0; 2258 for (i=0; i<nrqs; i++) { 2259 j = pa[i]; 2260 iptr += k; 2261 sbuf1[j] = iptr; 2262 k = w1[j]; 2263 } 2264 } 2265 2266 /* Form the outgoing messages. Initialize the header space */ 2267 for (i=0; i<nrqs; i++) { 2268 j = pa[i]; 2269 sbuf1[j][0] = 0; 2270 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2271 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2272 } 2273 2274 /* Parse the isrow and copy data into outbuf */ 2275 for (i=0; i<ismax; i++) { 2276 row2proc_i = row2proc[i]; 2277 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2278 irow_i = irow[i]; 2279 jmax = nrow[i]; 2280 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2281 proc = row2proc_i[j]; 2282 if (proc != rank) { /* copy to the outgoing buf*/ 2283 ctr[proc]++; 2284 *ptr[proc] = irow_i[j]; 2285 ptr[proc]++; 2286 } 2287 } 2288 /* Update the headers for the current IS */ 2289 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2290 if ((ctr_j = ctr[j])) { 2291 sbuf1_j = sbuf1[j]; 2292 k = ++sbuf1_j[0]; 2293 sbuf1_j[2*k] = ctr_j; 2294 sbuf1_j[2*k-1] = i; 2295 } 2296 } 2297 } 2298 2299 /* Now post the sends */ 2300 ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr); 2301 for (i=0; i<nrqs; ++i) { 2302 j = pa[i]; 2303 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr); 2304 } 2305 2306 /* Post Receives to capture the buffer size */ 2307 ierr = PetscMalloc1(nrqs,&r_waits2);CHKERRQ(ierr); 2308 ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr); 2309 if (nrqs) rbuf2[0] = tmp + msz; 2310 for (i=1; i<nrqs; ++i) { 2311 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2312 } 2313 for (i=0; i<nrqs; ++i) { 2314 j = pa[i]; 2315 ierr = 
MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 2316 } 2317 2318 /* Send to other procs the buf size they should allocate */ 2319 /* Receive messages*/ 2320 ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr); 2321 ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr); 2322 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2323 { 2324 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2325 PetscInt *sbuf2_i; 2326 2327 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 2328 for (i=0; i<nrqr; ++i) { 2329 req_size[i] = 0; 2330 rbuf1_i = rbuf1[i]; 2331 start = 2*rbuf1_i[0] + 1; 2332 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 2333 ierr = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr); 2334 sbuf2_i = sbuf2[i]; 2335 for (j=start; j<end; j++) { 2336 id = rbuf1_i[j] - rstart; 2337 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2338 sbuf2_i[j] = ncols; 2339 req_size[i] += ncols; 2340 } 2341 req_source1[i] = r_status1[i].MPI_SOURCE; 2342 /* form the header */ 2343 sbuf2_i[0] = req_size[i]; 2344 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2345 2346 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 2347 } 2348 } 2349 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2350 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2351 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2352 2353 /* Receive messages*/ 2354 ierr = PetscMalloc1(nrqs,&r_waits3);CHKERRQ(ierr); 2355 ierr = PetscMalloc1(nrqs,&r_status2);CHKERRQ(ierr); 2356 2357 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 2358 for (i=0; i<nrqs; ++i) { 2359 ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr); 2360 req_source2[i] = r_status2[i].MPI_SOURCE; 2361 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 2362 } 2363 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2364 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2365 2366 /* Wait on sends1 
and sends2 */ 2367 ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2368 ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2369 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2370 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2371 2372 /* Now allocate sending buffers for a->j, and send them off */ 2373 ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr); 2374 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2375 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);} 2376 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2377 2378 ierr = PetscMalloc1(nrqr,&s_waits3);CHKERRQ(ierr); 2379 { 2380 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2381 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2382 PetscInt cend = C->cmap->rend; 2383 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2384 2385 for (i=0; i<nrqr; i++) { 2386 rbuf1_i = rbuf1[i]; 2387 sbuf_aj_i = sbuf_aj[i]; 2388 ct1 = 2*rbuf1_i[0] + 1; 2389 ct2 = 0; 2390 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2391 kmax = rbuf1[i][2*j]; 2392 for (k=0; k<kmax; k++,ct1++) { 2393 row = rbuf1_i[ct1] - rstart; 2394 nzA = a_i[row+1] - a_i[row]; 2395 nzB = b_i[row+1] - b_i[row]; 2396 ncols = nzA + nzB; 2397 cworkA = a_j + a_i[row]; 2398 cworkB = b_j + b_i[row]; 2399 2400 /* load the column indices for this row into cols */ 2401 cols = sbuf_aj_i + ct2; 2402 2403 lwrite = 0; 2404 for (l=0; l<nzB; l++) { 2405 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2406 } 2407 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2408 for (l=0; l<nzB; l++) { 2409 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2410 } 2411 2412 ct2 += ncols; 2413 } 2414 } 2415 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 2416 } 2417 } 2418 2419 /* create col map: global col of C -> local col of submatrices */ 2420 { 2421 const PetscInt *icol_i; 2422 #if defined(PETSC_USE_CTABLE) 2423 
for (i=0; i<ismax; i++) { 2424 if (!allcolumns[i]) { 2425 ierr = PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2426 2427 jmax = ncol[i]; 2428 icol_i = icol[i]; 2429 cmap_i = cmap[i]; 2430 for (j=0; j<jmax; j++) { 2431 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2432 } 2433 } else cmap[i] = NULL; 2434 } 2435 #else 2436 for (i=0; i<ismax; i++) { 2437 if (!allcolumns[i]) { 2438 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2439 jmax = ncol[i]; 2440 icol_i = icol[i]; 2441 cmap_i = cmap[i]; 2442 for (j=0; j<jmax; j++) { 2443 cmap_i[icol_i[j]] = j+1; 2444 } 2445 } else cmap[i] = NULL; 2446 } 2447 #endif 2448 } 2449 2450 /* Create lens which is required for MatCreate... */ 2451 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2452 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2453 2454 if (ismax) { 2455 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2456 } 2457 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2458 2459 /* Update lens from local data */ 2460 for (i=0; i<ismax; i++) { 2461 row2proc_i = row2proc[i]; 2462 jmax = nrow[i]; 2463 if (!allcolumns[i]) cmap_i = cmap[i]; 2464 irow_i = irow[i]; 2465 lens_i = lens[i]; 2466 for (j=0; j<jmax; j++) { 2467 row = irow_i[j]; 2468 proc = row2proc_i[j]; 2469 if (proc == rank) { 2470 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2471 if (!allcolumns[i]) { 2472 for (k=0; k<ncols; k++) { 2473 #if defined(PETSC_USE_CTABLE) 2474 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2475 #else 2476 tcol = cmap_i[cols[k]]; 2477 #endif 2478 if (tcol) lens_i[j]++; 2479 } 2480 } else { /* allcolumns */ 2481 lens_i[j] = ncols; 2482 } 2483 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2484 } 2485 } 2486 } 2487 2488 /* Create row map: global row of C -> local row of submatrices */ 2489 #if defined(PETSC_USE_CTABLE) 2490 for (i=0; i<ismax; i++) { 2491 ierr = PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2492 irow_i = 
irow[i]; 2493 jmax = nrow[i]; 2494 for (j=0; j<jmax; j++) { 2495 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2496 } 2497 } 2498 #else 2499 for (i=0; i<ismax; i++) { 2500 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2501 rmap_i = rmap[i]; 2502 irow_i = irow[i]; 2503 jmax = nrow[i]; 2504 for (j=0; j<jmax; j++) { 2505 rmap_i[irow_i[j]] = j; 2506 } 2507 } 2508 #endif 2509 2510 /* Update lens from offproc data */ 2511 { 2512 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2513 2514 ierr = MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2515 for (tmp2=0; tmp2<nrqs; tmp2++) { 2516 sbuf1_i = sbuf1[pa[tmp2]]; 2517 jmax = sbuf1_i[0]; 2518 ct1 = 2*jmax+1; 2519 ct2 = 0; 2520 rbuf2_i = rbuf2[tmp2]; 2521 rbuf3_i = rbuf3[tmp2]; 2522 for (j=1; j<=jmax; j++) { 2523 is_no = sbuf1_i[2*j-1]; 2524 max1 = sbuf1_i[2*j]; 2525 lens_i = lens[is_no]; 2526 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2527 rmap_i = rmap[is_no]; 2528 for (k=0; k<max1; k++,ct1++) { 2529 #if defined(PETSC_USE_CTABLE) 2530 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2531 row--; 2532 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2533 #else 2534 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2535 #endif 2536 max2 = rbuf2_i[ct1]; 2537 for (l=0; l<max2; l++,ct2++) { 2538 if (!allcolumns[is_no]) { 2539 #if defined(PETSC_USE_CTABLE) 2540 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2541 #else 2542 tcol = cmap_i[rbuf3_i[ct2]]; 2543 #endif 2544 if (tcol) lens_i[row]++; 2545 } else { /* allcolumns */ 2546 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2547 } 2548 } 2549 } 2550 } 2551 } 2552 } 2553 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2554 ierr = MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2555 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2556 2557 /* Create the submatrices */ 2558 for (i=0; i<ismax; i++) { 2559 PetscInt rbs,cbs; 2560 2561 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2562 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2563 2564 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2565 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2566 2567 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2568 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2569 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2570 2571 /* create struct Mat_SubSppt and attached it to submat */ 2572 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2573 subc = (Mat_SeqAIJ*)submats[i]->data; 2574 subc->submatis1 = smat_i; 2575 2576 smat_i->destroy = submats[i]->ops->destroy; 2577 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2578 submats[i]->factortype = C->factortype; 2579 2580 smat_i->id = i; 2581 smat_i->nrqs = nrqs; 2582 smat_i->nrqr = nrqr; 2583 smat_i->rbuf1 = rbuf1; 2584 smat_i->rbuf2 = rbuf2; 2585 smat_i->rbuf3 = rbuf3; 2586 smat_i->sbuf2 = sbuf2; 2587 smat_i->req_source2 = req_source2; 2588 2589 smat_i->sbuf1 = sbuf1; 2590 smat_i->ptr = ptr; 2591 smat_i->tmp = tmp; 2592 smat_i->ctr = ctr; 2593 2594 smat_i->pa = pa; 2595 smat_i->req_size = req_size; 2596 smat_i->req_source1 = req_source1; 2597 2598 smat_i->allcolumns = allcolumns[i]; 2599 smat_i->singleis = PETSC_FALSE; 2600 smat_i->row2proc = row2proc[i]; 2601 smat_i->rmap = rmap[i]; 2602 smat_i->cmap = cmap[i]; 2603 } 2604 2605 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2606 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2607 ierr = 
MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2608 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2609 2610 /* create struct Mat_SubSppt and attached it to submat */ 2611 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2612 submats[0]->data = (void*)smat_i; 2613 2614 smat_i->destroy = submats[0]->ops->destroy; 2615 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2616 submats[0]->factortype = C->factortype; 2617 2618 smat_i->id = 0; 2619 smat_i->nrqs = nrqs; 2620 smat_i->nrqr = nrqr; 2621 smat_i->rbuf1 = rbuf1; 2622 smat_i->rbuf2 = rbuf2; 2623 smat_i->rbuf3 = rbuf3; 2624 smat_i->sbuf2 = sbuf2; 2625 smat_i->req_source2 = req_source2; 2626 2627 smat_i->sbuf1 = sbuf1; 2628 smat_i->ptr = ptr; 2629 smat_i->tmp = tmp; 2630 smat_i->ctr = ctr; 2631 2632 smat_i->pa = pa; 2633 smat_i->req_size = req_size; 2634 smat_i->req_source1 = req_source1; 2635 2636 smat_i->allcolumns = PETSC_FALSE; 2637 smat_i->singleis = PETSC_FALSE; 2638 smat_i->row2proc = NULL; 2639 smat_i->rmap = NULL; 2640 smat_i->cmap = NULL; 2641 } 2642 2643 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2644 ierr = PetscFree(lens);CHKERRQ(ierr); 2645 if (sbuf_aj) { 2646 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2647 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2648 } 2649 2650 } /* endof scall == MAT_INITIAL_MATRIX */ 2651 2652 /* Post recv matrix values */ 2653 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2654 ierr = PetscMalloc1(nrqs,&rbuf4);CHKERRQ(ierr); 2655 ierr = PetscMalloc1(nrqs,&r_waits4);CHKERRQ(ierr); 2656 for (i=0; i<nrqs; ++i) { 2657 ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr); 2658 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 2659 } 2660 2661 /* Allocate sending buffers for a->a, and send them off */ 2662 ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr); 2663 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2664 if (nrqr) {ierr = 
PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);} 2665 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2666 2667 ierr = PetscMalloc1(nrqr,&s_waits4);CHKERRQ(ierr); 2668 { 2669 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2670 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2671 PetscInt cend = C->cmap->rend; 2672 PetscInt *b_j = b->j; 2673 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2674 2675 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2676 ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2677 for (i=0; i<nrqr; i++) { 2678 rbuf1_i = rbuf1[i]; 2679 sbuf_aa_i = sbuf_aa[i]; 2680 ct1 = 2*rbuf1_i[0]+1; 2681 ct2 = 0; 2682 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2683 kmax = rbuf1_i[2*j]; 2684 for (k=0; k<kmax; k++,ct1++) { 2685 row = rbuf1_i[ct1] - rstart; 2686 nzA = a_i[row+1] - a_i[row]; 2687 nzB = b_i[row+1] - b_i[row]; 2688 ncols = nzA + nzB; 2689 cworkB = b_j + b_i[row]; 2690 vworkA = a_a + a_i[row]; 2691 vworkB = b_a + b_i[row]; 2692 2693 /* load the column values for this row into vals*/ 2694 vals = sbuf_aa_i+ct2; 2695 2696 lwrite = 0; 2697 for (l=0; l<nzB; l++) { 2698 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2699 } 2700 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2701 for (l=0; l<nzB; l++) { 2702 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2703 } 2704 2705 ct2 += ncols; 2706 } 2707 } 2708 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 2709 } 2710 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2711 ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2712 } 2713 2714 /* Assemble the matrices */ 2715 /* First assemble the local rows */ 2716 for (i=0; i<ismax; i++) { 2717 row2proc_i = row2proc[i]; 2718 subc = (Mat_SeqAIJ*)submats[i]->data; 2719 imat_ilen = subc->ilen; 2720 imat_j = subc->j; 2721 imat_i = 
subc->i; 2722 imat_a = subc->a; 2723 2724 if (!allcolumns[i]) cmap_i = cmap[i]; 2725 rmap_i = rmap[i]; 2726 irow_i = irow[i]; 2727 jmax = nrow[i]; 2728 for (j=0; j<jmax; j++) { 2729 row = irow_i[j]; 2730 proc = row2proc_i[j]; 2731 if (proc == rank) { 2732 old_row = row; 2733 #if defined(PETSC_USE_CTABLE) 2734 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2735 row--; 2736 #else 2737 row = rmap_i[row]; 2738 #endif 2739 ilen_row = imat_ilen[row]; 2740 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2741 mat_i = imat_i[row]; 2742 mat_a = imat_a + mat_i; 2743 mat_j = imat_j + mat_i; 2744 if (!allcolumns[i]) { 2745 for (k=0; k<ncols; k++) { 2746 #if defined(PETSC_USE_CTABLE) 2747 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2748 #else 2749 tcol = cmap_i[cols[k]]; 2750 #endif 2751 if (tcol) { 2752 *mat_j++ = tcol - 1; 2753 *mat_a++ = vals[k]; 2754 ilen_row++; 2755 } 2756 } 2757 } else { /* allcolumns */ 2758 for (k=0; k<ncols; k++) { 2759 *mat_j++ = cols[k]; /* global col index! 
*/ 2760 *mat_a++ = vals[k]; 2761 ilen_row++; 2762 } 2763 } 2764 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2765 2766 imat_ilen[row] = ilen_row; 2767 } 2768 } 2769 } 2770 2771 /* Now assemble the off proc rows */ 2772 ierr = MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2773 for (tmp2=0; tmp2<nrqs; tmp2++) { 2774 sbuf1_i = sbuf1[pa[tmp2]]; 2775 jmax = sbuf1_i[0]; 2776 ct1 = 2*jmax + 1; 2777 ct2 = 0; 2778 rbuf2_i = rbuf2[tmp2]; 2779 rbuf3_i = rbuf3[tmp2]; 2780 rbuf4_i = rbuf4[tmp2]; 2781 for (j=1; j<=jmax; j++) { 2782 is_no = sbuf1_i[2*j-1]; 2783 rmap_i = rmap[is_no]; 2784 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2785 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2786 imat_ilen = subc->ilen; 2787 imat_j = subc->j; 2788 imat_i = subc->i; 2789 imat_a = subc->a; 2790 max1 = sbuf1_i[2*j]; 2791 for (k=0; k<max1; k++,ct1++) { 2792 row = sbuf1_i[ct1]; 2793 #if defined(PETSC_USE_CTABLE) 2794 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2795 row--; 2796 #else 2797 row = rmap_i[row]; 2798 #endif 2799 ilen = imat_ilen[row]; 2800 mat_i = imat_i[row]; 2801 mat_a = imat_a + mat_i; 2802 mat_j = imat_j + mat_i; 2803 max2 = rbuf2_i[ct1]; 2804 if (!allcolumns[is_no]) { 2805 for (l=0; l<max2; l++,ct2++) { 2806 #if defined(PETSC_USE_CTABLE) 2807 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2808 #else 2809 tcol = cmap_i[rbuf3_i[ct2]]; 2810 #endif 2811 if (tcol) { 2812 *mat_j++ = tcol - 1; 2813 *mat_a++ = rbuf4_i[ct2]; 2814 ilen++; 2815 } 2816 } 2817 } else { /* allcolumns */ 2818 for (l=0; l<max2; l++,ct2++) { 2819 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2820 *mat_a++ = rbuf4_i[ct2]; 2821 ilen++; 2822 } 2823 } 2824 imat_ilen[row] = ilen; 2825 } 2826 } 2827 } 2828 2829 if (!iscsorted) { /* sort column indices of the rows */ 2830 for (i=0; i<ismax; i++) { 2831 subc = (Mat_SeqAIJ*)submats[i]->data; 2832 imat_j = subc->j; 2833 imat_i = subc->i; 2834 imat_a = subc->a; 2835 imat_ilen = 
subc->ilen; 2836 2837 if (allcolumns[i]) continue; 2838 jmax = nrow[i]; 2839 for (j=0; j<jmax; j++) { 2840 mat_i = imat_i[j]; 2841 mat_a = imat_a + mat_i; 2842 mat_j = imat_j + mat_i; 2843 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2844 } 2845 } 2846 } 2847 2848 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2849 ierr = MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2850 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2851 2852 /* Restore the indices */ 2853 for (i=0; i<ismax; i++) { 2854 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2855 if (!allcolumns[i]) { 2856 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2857 } 2858 } 2859 2860 for (i=0; i<ismax; i++) { 2861 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2862 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2863 } 2864 2865 /* Destroy allocated memory */ 2866 if (sbuf_aa) { 2867 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2868 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2869 } 2870 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2871 2872 for (i=0; i<nrqs; ++i) { 2873 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2874 } 2875 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2876 2877 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2878 PetscFunctionReturn(0); 2879 } 2880 2881 /* 2882 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2883 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2884 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2885 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2886 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2887 state, and needs to be "assembled" later by compressing B's column space. 
2888 2889 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2890 Following this call, C->A & C->B have been created, even if empty. 2891 */ 2892 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2893 { 2894 /* If making this function public, change the error returned in this function away from _PLIB. */ 2895 PetscErrorCode ierr; 2896 Mat_MPIAIJ *aij; 2897 Mat_SeqAIJ *Baij; 2898 PetscBool seqaij,Bdisassembled; 2899 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2900 PetscScalar v; 2901 const PetscInt *rowindices,*colindices; 2902 2903 PetscFunctionBegin; 2904 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2905 if (A) { 2906 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2907 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2908 if (rowemb) { 2909 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2910 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2911 } else { 2912 if (C->rmap->n != A->rmap->n) { 2913 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2914 } 2915 } 2916 if (dcolemb) { 2917 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2918 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2919 } else { 2920 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2921 } 2922 } 2923 if (B) { 2924 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2925 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2926 if (rowemb) { 2927 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2928 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2929 } else { 2930 if (C->rmap->n != B->rmap->n) { 2931 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2932 } 2933 } 2934 if (ocolemb) { 2935 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2936 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2937 } else { 2938 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2939 } 2940 } 2941 2942 aij = (Mat_MPIAIJ*)C->data; 2943 if (!aij->A) { 2944 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2945 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2946 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2947 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2948 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2949 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2950 } 2951 if (A) { 2952 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2953 } else { 2954 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2955 } 2956 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2957 /* 2958 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2959 need to "disassemble" B -- convert it to using C's global indices. 2960 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2961 2962 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2963 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2964 2965 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2966 At least avoid calling MatSetValues() and the implied searches? 2967 */ 2968 2969 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2970 #if defined(PETSC_USE_CTABLE) 2971 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2972 #else 2973 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2974 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2975 if (aij->B) { 2976 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2977 } 2978 #endif 2979 ngcol = 0; 2980 if (aij->lvec) { 2981 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2982 } 2983 if (aij->garray) { 2984 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2985 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2986 } 2987 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2988 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2989 } 2990 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2991 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2992 } 2993 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2994 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2995 } 2996 } 2997 Bdisassembled = PETSC_FALSE; 2998 if (!aij->B) { 2999 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3000 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3001 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3002 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3003 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3004 Bdisassembled = PETSC_TRUE; 3005 } 3006 if (B) { 3007 Baij = (Mat_SeqAIJ*)B->data; 3008 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3009 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3010 for (i=0; i<B->rmap->n; i++) { 3011 nz[i] = Baij->i[i+1] - Baij->i[i]; 3012 } 3013 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3014 ierr = PetscFree(nz);CHKERRQ(ierr); 3015 } 3016 3017 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3018 shift = rend-rstart; 3019 count = 0; 3020 rowindices = NULL; 3021 colindices = NULL; 3022 if (rowemb) { 3023 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3024 } 3025 if (ocolemb) { 3026 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3027 } 3028 for (i=0; i<B->rmap->n; i++) { 3029 PetscInt row; 3030 row = i; 3031 if (rowindices) row = rowindices[i]; 3032 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3033 col = Baij->j[count]; 3034 if (colindices) col = colindices[col]; 3035 if (Bdisassembled && col>=rstart) col += shift; 3036 v = Baij->a[count]; 3037 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3038 ++count; 3039 } 3040 } 3041 /* No assembly for aij->B is necessary. */ 3042 /* FIXME: set aij->B's nonzerostate correctly. */ 3043 } else { 3044 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3045 } 3046 C->preallocated = PETSC_TRUE; 3047 C->was_assembled = PETSC_FALSE; 3048 C->assembled = PETSC_FALSE; 3049 /* 3050 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3051 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3052 */ 3053 PetscFunctionReturn(0); 3054 } 3055 3056 /* 3057 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3058 */ 3059 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3060 { 3061 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3062 3063 PetscFunctionBegin; 3064 PetscValidPointer(A,2); 3065 PetscValidPointer(B,3); 3066 /* FIXME: make sure C is assembled */ 3067 *A = aij->A; 3068 *B = aij->B; 3069 /* Note that we don't incref *A and *B, so be careful! */ 3070 PetscFunctionReturn(0); 3071 } 3072 3073 /* 3074 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    size,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Sanity check: each isrow[i]/iscol[i] pair must share a communicator;
     cismax counts the pairs living on a communicator of more than one rank. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr);
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
      NOTE(review): ISSort() sorts the caller's ISs in place -- callers see their index sets reordered.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    /* Alias (no incref) the row IS of each parallel pair for the off-diag extraction. */
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    if (size > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
        - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
        - if caching arrays to hold the ISs, make and compose a container for them so that it can
          be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        /* A[i] has been copied into (*submat)[i] by setseqmats(); the standalone copy is no longer needed. */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        /* A nonzero permutation IS means the extracted A[i] is in sorted order; permute it into the caller's order. */
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MPIAIJ instantiation of the generic MPI submatrix extraction above. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}