1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 #include <../src/mat/impls/aij/mpi/mpiaij.h> 8 #include <petscbt.h> 9 #include <petscsf.h> 10 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 13 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 14 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 16 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 20 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 21 22 23 #undef __FUNCT__ 24 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 25 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 26 { 27 PetscErrorCode ierr; 28 PetscInt i; 29 30 PetscFunctionBegin; 31 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 32 for (i=0; i<ov; ++i) { 33 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 34 } 35 PetscFunctionReturn(0); 36 } 37 38 #undef __FUNCT__ 39 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable" 40 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 41 { 42 PetscErrorCode ierr; 43 PetscInt i; 44 45 PetscFunctionBegin; 46 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap 
specified"); 47 for (i=0; i<ov; ++i) { 48 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 49 } 50 PetscFunctionReturn(0); 51 } 52 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable" 56 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 57 { 58 PetscErrorCode ierr; 59 MPI_Comm comm; 60 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 61 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 62 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 63 const PetscInt *indices_i,**indices; 64 PetscLayout rmap; 65 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 66 PetscSF sf; 67 PetscSFNode *remote; 68 69 PetscFunctionBegin; 70 /*communicator */ 71 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 72 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 73 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 74 /*row map*/ 75 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 76 /*retrieve IS data*/ 77 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 78 /*get length and indices*/ 79 for (i=0,tlength=0; i<nidx; i++){ 80 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 81 tlength += length[i]; 82 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 83 } 84 /*find these rows on remote processors */ 85 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 86 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 87 nrrows = 0; 88 for (i=0; i<nidx; i++){ 89 length_i = length[i]; 90 indices_i = indices[i]; 91 for (j=0; j<length_i; j++){ 92 owner = -1; 93 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 94 /*remote processors*/ 95 if (owner != rank){ 96 tosizes_temp[owner]++; /*number of rows to owner*/ 97 rrow_ranks[nrrows] = owner; /*processor */ 98 rrow_isids[nrrows] = i; 
/*is id*/ 99 remoterows[nrrows++] = indices_i[j]; /* row */ 100 } 101 } 102 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 103 } 104 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 105 /*test if we need to exchange messages 106 * generally speaking, we do not need to exchange 107 * data when overlap is 1 108 * */ 109 ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 110 /*we do not have any messages 111 * It usually corresponds to overlap 1 112 * */ 113 if (!reducednrrows){ 114 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 115 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 116 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 117 PetscFunctionReturn(0); 118 } 119 nto = 0; 120 /*send sizes and ranks*/ 121 for (i=0; i<size; i++){ 122 if (tosizes_temp[i]){ 123 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 124 tosizes_temp[i] = nto; /* a map from processor to index */ 125 toranks[nto++] = i; /* processor */ 126 } 127 } 128 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 129 for (i=0; i<nto; i++){ 130 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /*offsets*/ 131 tosizes[2*i+1] = toffsets[i]; /*offsets to send*/ 132 } 133 /*send information to other processors*/ 134 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 135 /*build a star forest */ 136 nrecvrows = 0; 137 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 138 ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr); 139 nrecvrows = 0; 140 for (i=0; i<nfrom; i++){ 141 for (j=0; j<fromsizes[2*i]; j++){ 142 remote[nrecvrows].rank = fromranks[i]; 143 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 144 } 145 } 146 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 147 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 148 /*use two-sided communication by default since 
OPENMPI has some bugs for one-sided one*/ 149 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 150 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 151 /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/ 152 /*message pair <no of is, row> */ 153 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 154 for (i=0; i<nrrows; i++){ 155 owner = rrow_ranks[i]; /* processor */ 156 j = tosizes_temp[owner]; /* index */ 157 todata[toffsets[j]++] = rrow_isids[i]; 158 todata[toffsets[j]++] = remoterows[i]; 159 } 160 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 161 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 162 ierr = PetscFree(toffsets);CHKERRQ(ierr); 163 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 164 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 165 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 166 /*deal with remote data */ 167 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 168 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 169 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 170 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 171 ierr = PetscFree(fromranks);CHKERRQ(ierr); 172 nrecvrows = 0; 173 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 174 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 175 ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr); 176 nrecvrows = 0; 177 for (i=0; i<nto; i++){ 178 for (j=0; j<tosizes[2*i]; j++){ 179 remote[nrecvrows].rank = toranks[i]; 180 remote[nrecvrows++].index = tosizes[2*i+1]+j; 181 } 182 } 183 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 184 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 185 /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/ 186 
ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 187 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 188 /*overlap communication and computation*/ 189 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 190 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 191 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 192 /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/ 193 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 194 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 195 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 196 ierr = PetscFree(toranks);CHKERRQ(ierr); 197 ierr = PetscFree(tosizes);CHKERRQ(ierr); 198 ierr = PetscFree(todata);CHKERRQ(ierr); 199 PetscFunctionReturn(0); 200 } 201 202 #undef __FUNCT__ 203 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable" 204 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 205 { 206 PetscInt *isz,isz_i,i,j,is_id, data_size; 207 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 208 const PetscInt *indices_i_temp; 209 PetscErrorCode ierr; 210 211 PetscFunctionBegin; 212 max_lsize = 0; 213 ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr); 214 for (i=0; i<nidx; i++){ 215 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 216 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 217 isz[i] = lsize; 218 } 219 ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr); 220 for (i=0; i<nidx; i++){ 221 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 222 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 223 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 224 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 225 } 226 /*retrieve information */ 227 for (i=0; i<nrecvs; ){ 228 is_id = recvdata[i++]; 229 data_size = recvdata[i++]; 230 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 231 isz_i = isz[is_id]; 232 for (j=0; j< data_size; j++){ 233 col = recvdata[i++]; 234 indices_i[isz_i++] = col; 235 } 236 isz[is_id] = isz_i; 237 } 238 /*remove duplicate entities*/ 239 for (i=0; i<nidx; i++){ 240 indices_i = indices_temp+(max_lsize+nrecvs)*i; 241 isz_i = isz[i]; 242 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 243 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 244 } 245 ierr = PetscFree(isz);CHKERRQ(ierr); 246 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 247 PetscFunctionReturn(0); 248 } 249 250 #undef __FUNCT__ 251 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable" 252 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 253 { 254 PetscLayout rmap,cmap; 255 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 256 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 257 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 258 const PetscInt *gcols,*ai,*aj,*bi,*bj; 259 Mat amat,bmat; 260 PetscMPIInt rank; 261 PetscBool done; 262 MPI_Comm comm; 263 PetscErrorCode ierr; 264 265 PetscFunctionBegin; 266 ierr = 
PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 267 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 268 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 269 /* Even if the mat is symmetric, we still assume it is not symmetric*/ 270 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 271 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 272 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 273 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 274 /*total number of nonzero values */ 275 tnz = ai[an]+bi[bn]; 276 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 277 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 278 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 279 /*longest message */ 280 max_fszs = 0; 281 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 282 /*better way to estimate number of nonzero in the mat???*/ 283 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 284 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 285 rows_pos = 0; 286 totalrows = 0; 287 for (i=0; i<nfrom; i++){ 288 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 289 /*group data*/ 290 for (j=0; j<fromsizes[2*i]; j+=2){ 291 is_id = fromrows[rows_pos++];/*no of is*/ 292 rows_i = rows_data[is_id]; 293 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 294 } 295 /*estimate a space to avoid multiple allocations */ 296 for (j=0; j<nidx; j++){ 297 indvc_ij = 0; 298 rows_i = rows_data[j]; 299 for (l=0; l<rows_pos_i[j]; l++){ 300 row = rows_i[l]-rstart; 301 start = ai[row]; 302 end = ai[row+1]; 303 for (k=start; k<end; k++){ /* Amat */ 304 col = aj[k] + cstart; 305 indices_tmp[indvc_ij++] = col;/*do not count the rows from the 
original rank*/ 306 } 307 start = bi[row]; 308 end = bi[row+1]; 309 for (k=start; k<end; k++) { /* Bmat */ 310 col = gcols[bj[k]]; 311 indices_tmp[indvc_ij++] = col; 312 } 313 } 314 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 315 indv_counts[i*nidx+j] = indvc_ij; 316 totalrows += indvc_ij; 317 } 318 } 319 /*message triple <no of is, number of rows, rows> */ 320 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 321 totalrows = 0; 322 rows_pos = 0; 323 /* use this code again */ 324 for (i=0;i<nfrom;i++){ 325 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 326 for (j=0; j<fromsizes[2*i]; j+=2){ 327 is_id = fromrows[rows_pos++]; 328 rows_i = rows_data[is_id]; 329 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 330 } 331 /* add data */ 332 for (j=0; j<nidx; j++){ 333 if (!indv_counts[i*nidx+j]) continue; 334 indvc_ij = 0; 335 sbdata[totalrows++] = j; 336 sbdata[totalrows++] = indv_counts[i*nidx+j]; 337 sbsizes[2*i] += 2; 338 rows_i = rows_data[j]; 339 for (l=0; l<rows_pos_i[j]; l++){ 340 row = rows_i[l]-rstart; 341 start = ai[row]; 342 end = ai[row+1]; 343 for (k=start; k<end; k++){ /* Amat */ 344 col = aj[k] + cstart; 345 indices_tmp[indvc_ij++] = col; 346 } 347 start = bi[row]; 348 end = bi[row+1]; 349 for (k=start; k<end; k++) { /* Bmat */ 350 col = gcols[bj[k]]; 351 indices_tmp[indvc_ij++] = col; 352 } 353 } 354 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 355 sbsizes[2*i] += indvc_ij; 356 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 357 totalrows += indvc_ij; 358 } 359 } 360 /* offsets */ 361 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 362 for (i=0; i<nfrom; i++){ 363 offsets[i+1] = offsets[i] + sbsizes[2*i]; 364 sbsizes[2*i+1] = offsets[i]; 365 } 366 ierr = PetscFree(offsets);CHKERRQ(ierr); 367 if (sbrowsizes) *sbrowsizes = sbsizes; 368 if (sbrows) *sbrows = sbdata; 369 ierr = 
PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 370 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 371 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 372 PetscFunctionReturn(0); 373 } 374 375 #undef __FUNCT__ 376 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 377 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 378 { 379 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 380 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 381 PetscInt lsize,lsize_tmp,owner; 382 PetscMPIInt rank; 383 Mat amat,bmat; 384 PetscBool done; 385 PetscLayout cmap,rmap; 386 MPI_Comm comm; 387 PetscErrorCode ierr; 388 389 PetscFunctionBegin; 390 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 391 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 392 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 393 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 394 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 395 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 396 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 397 /*is it a safe way to compute number of nonzero values ?*/ 398 tnz = ai[an]+bi[bn]; 399 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 400 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 401 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 402 /*it is a better way to estimate memory than the old implementation 403 * where global size of matrix is used 404 * */ 405 ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr); 406 for (i=0; i<nidx; i++) { 407 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 408 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 409 
lsize_tmp = 0; 410 for (j=0; j<lsize; j++) { 411 owner = -1; 412 row = indices[j]; 413 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 414 if (owner != rank) continue; 415 /*local number*/ 416 row -= rstart; 417 start = ai[row]; 418 end = ai[row+1]; 419 for (k=start; k<end; k++) { /* Amat */ 420 col = aj[k] + cstart; 421 indices_temp[lsize_tmp++] = col; 422 } 423 start = bi[row]; 424 end = bi[row+1]; 425 for (k=start; k<end; k++) { /* Bmat */ 426 col = gcols[bj[k]]; 427 indices_temp[lsize_tmp++] = col; 428 } 429 } 430 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 431 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 432 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 433 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 434 } 435 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 436 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 437 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 438 PetscFunctionReturn(0); 439 } 440 441 442 /* 443 Sample message format: 444 If a processor A wants processor B to process some elements corresponding 445 to index sets is[1],is[5] 446 mesg [0] = 2 (no of index sets in the mesg) 447 ----------- 448 mesg [1] = 1 => is[1] 449 mesg [2] = sizeof(is[1]); 450 ----------- 451 mesg [3] = 5 => is[5] 452 mesg [4] = sizeof(is[5]); 453 ----------- 454 mesg [5] 455 mesg [n] datas[1] 456 ----------- 457 mesg[n+1] 458 mesg[m] data(is[5]) 459 ----------- 460 461 Notes: 462 nrqs - no of requests sent (or to be sent out) 463 nrqr - no of requests recieved (which have to be or which have been processed 464 */ 465 #undef __FUNCT__ 466 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 467 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 468 { 469 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 470 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 471 
const PetscInt **idx,*idx_i; 472 PetscInt *n,**data,len; 473 PetscErrorCode ierr; 474 PetscMPIInt size,rank,tag1,tag2; 475 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 476 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 477 PetscBT *table; 478 MPI_Comm comm; 479 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 480 MPI_Status *s_status,*recv_status; 481 char *t_p; 482 483 PetscFunctionBegin; 484 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 485 size = c->size; 486 rank = c->rank; 487 M = C->rmap->N; 488 489 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 490 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 491 492 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 493 494 for (i=0; i<imax; i++) { 495 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 496 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 497 } 498 499 /* evaluate communication - mesg to who,length of mesg, and buffer space 500 required. Based on this, buffers are allocated, and data copied into them*/ 501 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 502 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 503 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 504 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 505 for (i=0; i<imax; i++) { 506 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 507 idx_i = idx[i]; 508 len = n[i]; 509 for (j=0; j<len; j++) { 510 row = idx_i[j]; 511 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 512 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 513 w4[proc]++; 514 } 515 for (j=0; j<size; j++) { 516 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 517 } 518 } 519 520 nrqs = 0; /* no of outgoing messages */ 521 msz = 0; /* total mesg length 
(for all proc */ 522 w1[rank] = 0; /* no mesg sent to intself */ 523 w3[rank] = 0; 524 for (i=0; i<size; i++) { 525 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 526 } 527 /* pa - is list of processors to communicate with */ 528 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 529 for (i=0,j=0; i<size; i++) { 530 if (w1[i]) {pa[j] = i; j++;} 531 } 532 533 /* Each message would have a header = 1 + 2*(no of IS) + data */ 534 for (i=0; i<nrqs; i++) { 535 j = pa[i]; 536 w1[j] += w2[j] + 2*w3[j]; 537 msz += w1[j]; 538 } 539 540 /* Determine the number of messages to expect, their lengths, from from-ids */ 541 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 542 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 543 544 /* Now post the Irecvs corresponding to these messages */ 545 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 546 547 /* Allocate Memory for outgoing messages */ 548 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 549 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 550 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 551 552 { 553 PetscInt *iptr = tmp,ict = 0; 554 for (i=0; i<nrqs; i++) { 555 j = pa[i]; 556 iptr += ict; 557 outdat[j] = iptr; 558 ict = w1[j]; 559 } 560 } 561 562 /* Form the outgoing messages */ 563 /*plug in the headers*/ 564 for (i=0; i<nrqs; i++) { 565 j = pa[i]; 566 outdat[j][0] = 0; 567 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 568 ptr[j] = outdat[j] + 2*w3[j] + 1; 569 } 570 571 /* Memory for doing local proc's work*/ 572 { 573 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 574 575 for (i=0; i<imax; i++) { 576 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 577 data[i] = d_p + M*i; 578 } 579 } 580 581 /* Parse the IS and update local tables and the 
outgoing buf with the data*/ 582 { 583 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 584 PetscBT table_i; 585 586 for (i=0; i<imax; i++) { 587 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 588 n_i = n[i]; 589 table_i = table[i]; 590 idx_i = idx[i]; 591 data_i = data[i]; 592 isz_i = isz[i]; 593 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 594 row = idx_i[j]; 595 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 596 if (proc != rank) { /* copy to the outgoing buffer */ 597 ctr[proc]++; 598 *ptr[proc] = row; 599 ptr[proc]++; 600 } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */ 601 } 602 /* Update the headers for the current IS */ 603 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 604 if ((ctr_j = ctr[j])) { 605 outdat_j = outdat[j]; 606 k = ++outdat_j[0]; 607 outdat_j[2*k] = ctr_j; 608 outdat_j[2*k-1] = i; 609 } 610 } 611 isz[i] = isz_i; 612 } 613 } 614 615 /* Now post the sends */ 616 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 617 for (i=0; i<nrqs; ++i) { 618 j = pa[i]; 619 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 620 } 621 622 /* No longer need the original indices*/ 623 for (i=0; i<imax; ++i) { 624 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 625 } 626 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 627 628 for (i=0; i<imax; ++i) { 629 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 630 } 631 632 /* Do Local work*/ 633 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 634 635 /* Receive messages*/ 636 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 637 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 638 639 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 640 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 641 642 /* Phase 1 sends are complete - deallocate buffers */ 643 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 644 ierr = 
PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 645 646 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 647 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 648 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 649 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 650 ierr = PetscFree(rbuf);CHKERRQ(ierr); 651 652 653 /* Send the data back*/ 654 /* Do a global reduction to know the buffer space req for incoming messages*/ 655 { 656 PetscMPIInt *rw1; 657 658 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 659 660 for (i=0; i<nrqr; ++i) { 661 proc = recv_status[i].MPI_SOURCE; 662 663 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 664 rw1[proc] = isz1[i]; 665 } 666 ierr = PetscFree(onodes1);CHKERRQ(ierr); 667 ierr = PetscFree(olengths1);CHKERRQ(ierr); 668 669 /* Determine the number of messages to expect, their lengths, from from-ids */ 670 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 671 ierr = PetscFree(rw1);CHKERRQ(ierr); 672 } 673 /* Now post the Irecvs corresponding to these messages */ 674 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 675 676 /* Now post the sends */ 677 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 678 for (i=0; i<nrqr; ++i) { 679 j = recv_status[i].MPI_SOURCE; 680 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 681 } 682 683 /* receive work done on other processors*/ 684 { 685 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 686 PetscMPIInt idex; 687 PetscBT table_i; 688 MPI_Status *status2; 689 690 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 691 for (i=0; i<nrqs; ++i) { 692 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 693 /* Process the message*/ 694 rbuf2_i = rbuf2[idex]; 695 ct1 = 2*rbuf2_i[0]+1; 696 jmax = rbuf2[idex][0]; 697 for (j=1; j<=jmax; j++) { 698 max = rbuf2_i[2*j]; 699 is_no = rbuf2_i[2*j-1]; 700 isz_i = 
isz[is_no]; 701 data_i = data[is_no]; 702 table_i = table[is_no]; 703 for (k=0; k<max; k++,ct1++) { 704 row = rbuf2_i[ct1]; 705 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 706 } 707 isz[is_no] = isz_i; 708 } 709 } 710 711 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 712 ierr = PetscFree(status2);CHKERRQ(ierr); 713 } 714 715 for (i=0; i<imax; ++i) { 716 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 717 } 718 719 ierr = PetscFree(onodes2);CHKERRQ(ierr); 720 ierr = PetscFree(olengths2);CHKERRQ(ierr); 721 722 ierr = PetscFree(pa);CHKERRQ(ierr); 723 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 724 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 725 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 726 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 727 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 728 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 729 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 730 ierr = PetscFree(s_status);CHKERRQ(ierr); 731 ierr = PetscFree(recv_status);CHKERRQ(ierr); 732 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 733 ierr = PetscFree(xdata);CHKERRQ(ierr); 734 ierr = PetscFree(isz1);CHKERRQ(ierr); 735 PetscFunctionReturn(0); 736 } 737 738 #undef __FUNCT__ 739 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 740 /* 741 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 742 the work on the local processor. 743 744 Inputs: 745 C - MAT_MPIAIJ; 746 imax - total no of index sets processed at a time; 747 table - an array of char - size = m bits. 
748 749 Output: 750 isz - array containing the count of the solution elements corresponding 751 to each index set; 752 data - pointer to the solutions 753 */ 754 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 755 { 756 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 757 Mat A = c->A,B = c->B; 758 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 759 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 760 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 761 PetscBT table_i; 762 763 PetscFunctionBegin; 764 rstart = C->rmap->rstart; 765 cstart = C->cmap->rstart; 766 ai = a->i; 767 aj = a->j; 768 bi = b->i; 769 bj = b->j; 770 garray = c->garray; 771 772 773 for (i=0; i<imax; i++) { 774 data_i = data[i]; 775 table_i = table[i]; 776 isz_i = isz[i]; 777 for (j=0,max=isz[i]; j<max; j++) { 778 row = data_i[j] - rstart; 779 start = ai[row]; 780 end = ai[row+1]; 781 for (k=start; k<end; k++) { /* Amat */ 782 val = aj[k] + cstart; 783 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 784 } 785 start = bi[row]; 786 end = bi[row+1]; 787 for (k=start; k<end; k++) { /* Bmat */ 788 val = garray[bj[k]]; 789 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 790 } 791 } 792 isz[i] = isz_i; 793 } 794 PetscFunctionReturn(0); 795 } 796 797 #undef __FUNCT__ 798 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 799 /* 800 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 801 and return the output 802 803 Input: 804 C - the matrix 805 nrqr - no of messages being processed. 806 rbuf - an array of pointers to the recieved requests 807 808 Output: 809 xdata - array of messages to be sent back 810 isz1 - size of each message 811 812 For better efficiency perhaps we should malloc separately each xdata[i], 813 then if a remalloc is required we need only copy the data for that one row 814 rather then all previous rows as it is now where a single large chunck of 815 memory is used. 
*/
/*
  MatIncreaseOverlap_MPIAIJ_Receive - processes the nrqr incoming row-request
  messages in rbuf[] and builds the reply buffers xdata[].

  Each rbuf[i] has the layout: [no_of_IS, (is_id,count) pairs..., row indices...].
  For every requested row the rows' column indices from the diagonal block (a)
  and the off-diagonal block (b, translated to global numbering via c->garray)
  are accumulated; a PetscBT bit table deduplicates entries per IS.  All replies
  share the single backing allocation xdata[0]; when the initial size estimate
  overflows, the buffer is grown by 1.5x and the per-message pointers
  xdata[1..i] are rebased onto the new allocation.  isz1[i] returns the packed
  length of reply i.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total number of rows requested (ct) and total index count (total_sz) over all messages */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial reply-buffer estimate: average nonzeros/row times rows requested, or total_sz */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* cursor past the header into the row data */
    ct2    =  ct1;          /* write cursor into xdata[i] */
    ct3   += ct1;           /* global usage counter for the shared backing buffer */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first pass: record the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase xdata[1..i] onto it */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* second pass: append the column neighborhoods of the rows recorded above */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        /* diagonal-block columns, shifted to global numbering by cstart */
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        /* off-diagonal-block columns, mapped to global numbering via garray */
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one in the shared buffer */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix: gathers row lengths, column indices
    and (optionally, when flag == MAT_GET_VALUES) numerical values from all ranks
    with MPI_Allgatherv into one sequential matrix *Bin[0] per process.
    With scall == MAT_REUSE_MATRIX only the values are refreshed.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* local row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over; my rows go directly into
       B's index array at the offset given by my row range (rstarts[rank])
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: the counts/displs were not built above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Driver: extracts the ismax sequential submatrices C(isrow[i],iscol[i]) in
    stages (to bound memory), after first checking for two fast paths:
    every process wants the whole matrix, or an iscol[] spans all columns.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* the fast path is collective, so take it only if every rank wants it (MPI_MIN) */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages (the per-stage worker is collective) */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0; /* this rank is done; participate with an empty stage */
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
/*
    Worker for one stage: extracts ismax sequential submatrices.
    Communication plan (four tagged rounds of nonblocking messages):
      tag0: send the row indices each remote owner must serve (sbuf1/rbuf1);
      tag1: owners reply with per-row nonzero counts (sbuf2/rbuf2);
      tag2: owners send the column indices for those rows (sbuf_aj/rbuf3);
      tag3: owners send the numerical values (sbuf_aa/rbuf4).
    Local rows are filled directly via MatGetRow_MPIAIJ.  Row/column global->local
    maps use PetscTable when PETSC_USE_CTABLE is defined, dense arrays otherwise.
    allcolumns[i] short-circuits the column map when iscol[i] is all columns.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      /* linear scan of the ownership ranges to find the owning process */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;} /* w1 = total rows to proc j, w3 = no of ISs hitting proc j */
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve per-destination send buffers out of the single tmp allocation */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j; /* no of rows for IS i */
        sbuf1_j[2*k-1] = i;     /* which IS they belong to */
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz; /* second half of the tmp allocation holds the size replies */
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA  = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      /* reply with the nonzero count of each requested row */
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  /* as each size reply arrives, allocate and post the index/value receives */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols,
             in ascending global order: off-diag below cstart,
             then diagonal block, then off-diag at/after cend */
          cols = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals,
             in the same order as the indices packed above */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices (stored +1 so 0 means "absent") */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k];  /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        rmap_i = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  /* sort the rows */
  {
    PetscInt    *imat_ilen,*imat_j,*imat_i;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      imat_ilen = mat->ilen;

      if (allcolumns[i]) continue; /* allcolumns rows are already in global (ascending) order */
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscInt ilen;

        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ilen  = imat_ilen[j];
        ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
    Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.
1961 1962 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 1963 Following this call, C->A & C->B have been created, even if empty. 1964 */ 1965 #undef __FUNCT__ 1966 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 1967 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 1968 { 1969 /* If making this function public, change the error returned in this function away from _PLIB. */ 1970 PetscErrorCode ierr; 1971 Mat_MPIAIJ *aij; 1972 Mat_SeqAIJ *Baij; 1973 PetscBool seqaij,Bdisassembled; 1974 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 1975 PetscScalar v; 1976 const PetscInt *rowindices,*colindices; 1977 1978 PetscFunctionBegin; 1979 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 1980 if (A) { 1981 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 1982 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 1983 if (rowemb) { 1984 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 1985 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 1986 } else { 1987 if (C->rmap->n != A->rmap->n) { 1988 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 1989 } 1990 } 1991 if (dcolemb) { 1992 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 1993 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 1994 } else { 1995 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 1996 } 1997 } 1998 if (B) { 1999 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2000 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong 
type"); 2001 if (rowemb) { 2002 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2003 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2004 } else { 2005 if (C->rmap->n != B->rmap->n) { 2006 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2007 } 2008 } 2009 if (ocolemb) { 2010 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2011 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2012 } else { 2013 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2014 } 2015 } 2016 2017 aij = (Mat_MPIAIJ*)(C->data); 2018 if (!aij->A) { 2019 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2020 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2021 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2022 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2023 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2024 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2025 } 2026 if (A) { 2027 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2028 } else { 2029 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2030 } 2031 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2032 /* 2033 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2034 need to "disassemble" B -- convert it to using C's global indices. 2035 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2036 2037 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2038 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2039 2040 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2041 At least avoid calling MatSetValues() and the implied searches? 2042 */ 2043 2044 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2045 #if defined(PETSC_USE_CTABLE) 2046 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2047 #else 2048 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2049 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2050 if (aij->B) { 2051 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2052 } 2053 #endif 2054 ngcol = 0; 2055 if (aij->lvec) { 2056 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2057 } 2058 if (aij->garray) { 2059 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2060 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2061 } 2062 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2063 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2064 } 2065 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2066 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2067 } 2068 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2069 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2070 } 2071 } 2072 Bdisassembled = PETSC_FALSE; 2073 if (!aij->B) { 2074 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2075 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2076 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2077 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2078 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2079 Bdisassembled = PETSC_TRUE; 2080 } 2081 if (B) { 2082 Baij = (Mat_SeqAIJ*)(B->data); 2083 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2084 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2085 for (i=0; i<B->rmap->n; i++) { 2086 nz[i] = Baij->i[i+1] - Baij->i[i]; 2087 } 2088 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
2089 ierr = PetscFree(nz);CHKERRQ(ierr); 2090 } 2091 2092 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2093 shift = rend-rstart; 2094 count = 0; 2095 rowindices = NULL; 2096 colindices = NULL; 2097 if (rowemb) { 2098 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 2099 } 2100 if (ocolemb) { 2101 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 2102 } 2103 for (i=0; i<B->rmap->n; i++) { 2104 PetscInt row; 2105 row = i; 2106 if (rowindices) row = rowindices[i]; 2107 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 2108 col = Baij->j[count]; 2109 if (colindices) col = colindices[col]; 2110 if (Bdisassembled && col>=rstart) col += shift; 2111 v = Baij->a[count]; 2112 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 2113 ++count; 2114 } 2115 } 2116 /* No assembly for aij->B is necessary. */ 2117 /* FIXME: set aij->B's nonzerostate correctly. */ 2118 } else { 2119 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 2120 } 2121 C->preallocated = PETSC_TRUE; 2122 C->was_assembled = PETSC_FALSE; 2123 C->assembled = PETSC_FALSE; 2124 /* 2125 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2126 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2127 */ 2128 PetscFunctionReturn(0); 2129 } 2130 2131 #undef __FUNCT__ 2132 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 2133 /* 2134 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 2135 */ 2136 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 2137 { 2138 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 2139 2140 PetscFunctionBegin; 2141 PetscValidPointer(A,2); 2142 PetscValidPointer(B,3); 2143 /* FIXME: make sure C is assembled */ 2144 *A = aij->A; 2145 *B = aij->B; 2146 /* Note that we don't incref *A and *B, so be careful! 
 */
  PetscFunctionReturn(0);
}

/*
    Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
    NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
/*
   Generic driver: extracts the sequential diag (A) and off-diag (B) pieces via the supplied
   callbacks and stitches them into parallel submatrices for index sets whose communicator
   has size > 1; purely sequential index sets reduce to a plain getsubmats_seq() call.

   Callbacks (see MatGetSubMatricesMPI_MPIAIJ below for the AIJ instantiation):
     getsubmats_seq - extracts sequential submatrices;
     getlocalmats   - returns the local A & B pieces of an MPI matrix;
     setseqmat      - embeds one seq matrix into a seq submatrix;
     setseqmats     - embeds seq A & B into an MPI submatrix.

   NOTE(review): ISSort() below sorts the caller's isrow[]/iscol[] in place -- presumably
   callers tolerate this side effect; confirm before making this interface public.
*/
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                            PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                            PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                            PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                            PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel (comm size > 1) index sets; only those need an off-diag piece. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize; /* NOTE(review): shadows the function-scope isize; harmless here but confusing. */
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPI_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  /* ii is a cursor over the cismax-long "parallel only" arrays throughout this function. */
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    /* After sorting, duplicates are adjacent, so one pass suffices. */
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);

    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /* Alias (no incref) the row IS as the off-diag row IS for this parallel submatrix. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        BB = NULL;
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        /* AA/BB were either duplicated above or freshly created by getsubmats_seq; in both
           cases this rank owns them and must destroy them here (guard conditions match the
           conditions under which AA/BB were set non-NULL). */
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
/* AIJ instantiation of the generic driver above: supplies the AIJ-specific callbacks. */
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}