/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt*,PetscInt*,PetscInt*,PetscInt**,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt*);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable"
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

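/*
   Usage sketch (hypothetical, not part of this file): these routines are normally reached
   through the generic MatIncreaseOverlap() interface, for example when PCASM builds
   overlapping subdomains.  A minimal driver might look like

       IS       is[1];
       PetscInt ov = 2;                                      (requested overlap)
       ierr = ISCreateStride(PETSC_COMM_SELF,10,rstart,1,&is[0]);CHKERRQ(ierr);   (rstart is a hypothetical local offset)
       ierr = MatIncreaseOverlap(A,1,is,ov);CHKERRQ(ierr);   (is[0] is replaced by the enlarged index set)
       ierr = ISDestroy(&is[0]);CHKERRQ(ierr);

   Each call to the _Once or _Once_Scalable routine below grows the index sets by one
   layer of matrix connectivity, so the wrappers above simply repeat that step ov times.
*/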
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode  ierr;
  MPI_Comm        comm;
  PetscInt        *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt        *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt        nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt  *indices_i,**indices;
  PetscLayout     rmap;
  PetscMPIInt     rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF         sf;
  PetscSFNode     *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <is id, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
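  /*
     Request layout (hypothetical example): if this rank needs rows 7 and 12 for is[0]
     and row 30 for is[2], and all three rows are owned by the same remote rank, the
     slice of todata shipped to that rank is the flat pair list

         todata = { 0,7,  0,12,  2,30 }        i.e. <is id, global row> repeated

     fromdata on the owning rank receives exactly these pairs, which
     MatIncreaseOverlap_MPIAIJ_Send_Scalable() below expands row by row.
  */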
  /* send back the rows that belong to remote processors so that we can get the overlapping data */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx,IS is[],PetscInt nrecvs,PetscInt *recvdata)
{
  PetscInt        *isz,isz_i,i,j,is_id,data_size;
  PetscInt        col,lsize,max_lsize,*indices_temp,*indices_i;
  const PetscInt  *indices_i_temp;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp,sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j<data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx,PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes,PetscInt *fromrows,PetscInt **sbrowsizes,PetscInt **sbrows)
{
  PetscLayout     rmap,cmap;
  PetscInt        i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt        is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt        *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt  *gcols,*ai,*aj,*bi,*bj;
  Mat             amat,bmat;
  PetscMPIInt     rank;
  PetscBool       done;
  MPI_Comm        comm;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the matrix is symmetric, we treat it as nonsymmetric here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* is there a better way to estimate the number of nonzeros in the matrix? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++]; /* is id */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <is id, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i=0; i<nfrom; i++) {
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows)     *sbrows     = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

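/*
   Reply layout (hypothetical example): suppose a remote rank asked this rank, via the
   <is id, row> pairs above, for rows whose combined, deduplicated column sets turn out
   to be {4,5,9} for is[0] and {11,12} for is[3].  The slice of sbdata destined for that
   rank is

       sbdata = { 0,3, 4,5,9,   3,2, 11,12 }

   i.e. repeated triples <is id, count, sorted column indices>.  sbsizes[2*i] holds the
   total length of the slice for the i-th requesting rank and sbsizes[2*i+1] its offset,
   which is exactly what MatIncreaseOverlap_MPIAIJ_Receive_Scalable() walks through.
*/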
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  const PetscInt  *gcols,*ai,*aj,*bi,*bj,*indices;
  PetscInt        tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt        lsize,lsize_tmp,owner;
  PetscMPIInt     rank;
  Mat             amat,bmat;
  PetscBool       done;
  PetscLayout     cmap,rmap;
  MPI_Comm        comm;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* this is a better way to estimate memory than the old implementation
   * where the global size of the matrix is used
   */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]   data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
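/*
   Concrete instance of the format above (hypothetical values): if A asks B to handle
   rows {3,9} for is[1] and row {14} for is[5], the outgoing buffer is

       mesg = { 2,  1,2,  5,1,  3,9,  14 }

   i.e. a header of length 1 + 2*(no of IS) = 5 followed by the row lists in IS order.
*/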
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ      *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt     *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt  **idx,*idx_i;
  PetscInt        *n,**data,len;
  PetscErrorCode  ierr;
  PetscMPIInt     size,rank,tag1,tag2;
  PetscInt        M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt        *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT         *table;
  MPI_Comm        comm;
  MPI_Request     *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status      *s_status,*recv_status;
  char            *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  for (i=0; i<imax; i++) {
    ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

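  /*
     Work-vector bookkeeping (summary of the loops above): for every destination rank p,
       w4[p] = rows of the current IS owned by p (reset for each IS),
       w3[p] = number of index sets that contribute at least one row for p,
       w1[p] = total number of rows destined for p (grown above to the full message length),
       w2[p] = 1 iff a message will be sent to p.
     Hypothetical example: two index sets contributing 2 and 1 rows for rank p give
     w3[p] = 2 and w1[p] = 3, and the adjustment just above yields a message of
     1 + 2*2 + 3 = 8 integers, matching the sample format shown earlier.
  */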
  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can optimize this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

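/*
   Note on the PetscBT bit tables used above and in the routines below:
   PetscBTLookupSet(table,row) returns the previous state of bit 'row' and sets it,
   so the idiom

       if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;

   appends each global index to the output list only the first time it is encountered,
   giving duplicate-free index sets without a separate sort/unique pass.
*/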
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

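/*
   Off-diagonal column translation (applies to the A/B traversals above and below):
   the diagonal block A stores its columns with local indices, so a global column is
   recovered as aj[k] + cstart, while the off-diagonal block B stores compressed column
   indices that are mapped through garray.  For example (hypothetical values), with
   cstart = 100 and garray = {2, 57, 311}, an entry with bj[k] = 1 contributes global
   column 57, and an entry with aj[k] = 3 contributes global column 103.
*/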
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
      and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

      For better efficiency perhaps we should malloc separately each xdata[i],
      then if a realloc is required we need only copy the data for that one row
      rather than all previous rows as it is now where a single large chunk of
      memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
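/*
   The routine below is the fast path taken by MatGetSubMatrices_MPIAIJ() when every
   process asks for the identity row and column index sets of a square matrix, e.g.
   (hypothetical driver, not part of this file)

       Mat *submat;
       ierr = ISCreateStride(PETSC_COMM_SELF,A->rmap->N,0,1,&isrow);CHKERRQ(ierr);
       ierr = ISCreateStride(PETSC_COMM_SELF,A->cmap->N,0,1,&iscol);CHKERRQ(ierr);
       ierr = MatGetSubMatrices(A,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);

   in which case the whole matrix is gathered on each process with a few
   MPI_Allgatherv() calls (row lengths, column indices, then values) instead of the
   general request/reply protocol used by MatGetSubMatrices_MPIAIJ_Local().
*/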
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
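  /*
     Worked example (hypothetical sizes): for a matrix with 1,000,000 columns and
     4-byte PetscInt, nmax = 20000000/(1000000*4) = 5, so at most 5 submatrices are
     extracted per stage; a request for ismax = 12 submatrices then runs in
     nstages = 12/5 + 1 = 3 stages of sizes 5, 5 and 2 (assuming all ranks agree).
  */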
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialize work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) { /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can optimize this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;
        sbuf1_j[2*k-1] = i;
      }
    }
  }
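  /*
     Communication protocol from here on (one message per peer and phase):
       tag0 : row requests        sbuf1   -> rbuf1   (header plus requested global rows)
       tag1 : reply sizes         sbuf2   -> rbuf2   (number of nonzeros in each requested row)
       tag2 : column indices      sbuf_aj -> rbuf3
       tag3 : numerical values    sbuf_aa -> rbuf4
     The receives for each phase are posted before the corresponding sends are made.
  */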

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr     = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr     = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
    PetscInt   *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart;
    PetscInt   *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages */

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols */
          cols = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i,*cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

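  /*
     Note on the column map built below: for each index set i with an explicit column
     list, cmap[i] maps a global column of C to (local column + 1) in the submatrix,
     so that a lookup of 0 means "column not selected".  For example (hypothetical),
     iscol[i] = {7,2,9} gives cmap_i[7] = 1, cmap_i[2] = 2, cmap_i[9] = 3 and
     cmap_i[c] = 0 for every other global column c; the row map rmap[i] plays the
     analogous role for rows (the PetscTable variant stores key+1/value+1).
  */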
const PetscInt *icol_i; 1546 #if defined(PETSC_USE_CTABLE) 1547 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1548 for (i=0; i<ismax; i++) { 1549 if (!allcolumns[i]) { 1550 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1551 1552 jmax = ncol[i]; 1553 icol_i = icol[i]; 1554 cmap_i = cmap[i]; 1555 for (j=0; j<jmax; j++) { 1556 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1557 } 1558 } else { 1559 cmap[i] = NULL; 1560 } 1561 } 1562 #else 1563 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1564 for (i=0; i<ismax; i++) { 1565 if (!allcolumns[i]) { 1566 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1567 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1568 jmax = ncol[i]; 1569 icol_i = icol[i]; 1570 cmap_i = cmap[i]; 1571 for (j=0; j<jmax; j++) { 1572 cmap_i[icol_i[j]] = j+1; 1573 } 1574 } else { 1575 cmap[i] = NULL; 1576 } 1577 } 1578 #endif 1579 } 1580 1581 /* Create lens which is required for MatCreate... */ 1582 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1583 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1584 if (ismax) { 1585 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1586 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1587 } 1588 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1589 1590 /* Update lens from local data */ 1591 for (i=0; i<ismax; i++) { 1592 jmax = nrow[i]; 1593 if (!allcolumns[i]) cmap_i = cmap[i]; 1594 irow_i = irow[i]; 1595 lens_i = lens[i]; 1596 for (j=0; j<jmax; j++) { 1597 l = 0; 1598 row = irow_i[j]; 1599 while (row >= C->rmap->range[l+1]) l++; 1600 proc = l; 1601 if (proc == rank) { 1602 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1603 if (!allcolumns[i]) { 1604 for (k=0; k<ncols; k++) { 1605 #if defined(PETSC_USE_CTABLE) 1606 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1607 #else 1608 tcol = cmap_i[cols[k]]; 1609 #endif 1610 if (tcol) lens_i[j]++; 1611 } 1612 } else { /* allcolumns */ 1613 lens_i[j] = ncols; 1614 } 1615 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1616 } 1617 } 1618 } 1619 1620 /* Create row map: global row of C -> local row of submatrices */ 1621 #if defined(PETSC_USE_CTABLE) 1622 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1623 for (i=0; i<ismax; i++) { 1624 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1625 rmap_i = rmap[i]; 1626 irow_i = irow[i]; 1627 jmax = nrow[i]; 1628 for (j=0; j<jmax; j++) { 1629 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1630 } 1631 } 1632 #else 1633 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1634 if (ismax) { 1635 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1636 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1637 } 1638 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1639 for (i=0; i<ismax; i++) { 1640 rmap_i = rmap[i]; 1641 irow_i = irow[i]; 1642 jmax = nrow[i]; 1643 for (j=0; j<jmax; j++) { 1644 rmap_i[irow_i[j]] = j; 1645 } 1646 } 1647 #endif 1648 1649 /* Update lens from offproc data */ 1650 { 1651 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1652 1653 for (tmp2=0; tmp2<nrqs; tmp2++) { 1654 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1655 idex = pa[idex2]; 1656 sbuf1_i = sbuf1[idex]; 1657 jmax = sbuf1_i[0]; 1658 ct1 = 2*jmax+1; 1659 ct2 = 0; 1660 rbuf2_i = rbuf2[idex2]; 1661 rbuf3_i = rbuf3[idex2]; 1662 for (j=1; j<=jmax; j++) { 1663 is_no = sbuf1_i[2*j-1]; 1664 max1 = sbuf1_i[2*j]; 1665 lens_i = 
lens[is_no]; 1666 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1667 rmap_i = rmap[is_no]; 1668 for (k=0; k<max1; k++,ct1++) { 1669 #if defined(PETSC_USE_CTABLE) 1670 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1671 row--; 1672 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1673 #else 1674 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1675 #endif 1676 max2 = rbuf2_i[ct1]; 1677 for (l=0; l<max2; l++,ct2++) { 1678 if (!allcolumns[is_no]) { 1679 #if defined(PETSC_USE_CTABLE) 1680 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1681 #else 1682 tcol = cmap_i[rbuf3_i[ct2]]; 1683 #endif 1684 if (tcol) lens_i[row]++; 1685 } else { /* allcolumns */ 1686 lens_i[row]++; /* lens_i[row] += max2 ? */ 1687 } 1688 } 1689 } 1690 } 1691 } 1692 } 1693 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1694 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1695 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1696 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1697 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1698 1699 /* Create the submatrices */ 1700 if (scall == MAT_REUSE_MATRIX) { 1701 PetscBool flag; 1702 1703 /* 1704 Assumes new rows are same length as the old rows,hence bug! 1705 */ 1706 for (i=0; i<ismax; i++) { 1707 mat = (Mat_SeqAIJ*)(submats[i]->data); 1708 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1709 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1710 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1711 /* Initial matrix as if empty */ 1712 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1713 1714 submats[i]->factortype = C->factortype; 1715 } 1716 } else { 1717 for (i=0; i<ismax; i++) { 1718 PetscInt rbs,cbs; 1719 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1720 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1721 1722 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1723 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1724 1725 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1726 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1727 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1728 } 1729 } 1730 1731 /* Assemble the matrices */ 1732 /* First assemble the local rows */ 1733 { 1734 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1735 PetscScalar *imat_a; 1736 1737 for (i=0; i<ismax; i++) { 1738 mat = (Mat_SeqAIJ*)submats[i]->data; 1739 imat_ilen = mat->ilen; 1740 imat_j = mat->j; 1741 imat_i = mat->i; 1742 imat_a = mat->a; 1743 1744 if (!allcolumns[i]) cmap_i = cmap[i]; 1745 rmap_i = rmap[i]; 1746 irow_i = irow[i]; 1747 jmax = nrow[i]; 1748 for (j=0; j<jmax; j++) { 1749 l = 0; 1750 row = irow_i[j]; 1751 while (row >= C->rmap->range[l+1]) l++; 1752 proc = l; 1753 if (proc == rank) { 1754 old_row = row; 1755 #if defined(PETSC_USE_CTABLE) 1756 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1757 row--; 1758 #else 1759 row = rmap_i[row]; 1760 #endif 1761 ilen_row = imat_ilen[row]; 1762 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1763 mat_i = imat_i[row]; 1764 mat_a = imat_a + mat_i; 1765 mat_j = imat_j + mat_i; 1766 if (!allcolumns[i]) { 1767 for (k=0; k<ncols; k++) { 1768 #if defined(PETSC_USE_CTABLE) 1769 
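            /* cmap_i lookups mirror the construction above: keys were stored as the global
               column + 1 and values as the local column + 1, so tcol == 0 below means the
               column is not selected by iscol[i], and tcol-1 is the submatrix column otherwise */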
ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1770 #else 1771 tcol = cmap_i[cols[k]]; 1772 #endif 1773 if (tcol) { 1774 *mat_j++ = tcol - 1; 1775 *mat_a++ = vals[k]; 1776 ilen_row++; 1777 } 1778 } 1779 } else { /* allcolumns */ 1780 for (k=0; k<ncols; k++) { 1781 *mat_j++ = cols[k]; /* global col index! */ 1782 *mat_a++ = vals[k]; 1783 ilen_row++; 1784 } 1785 } 1786 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1787 1788 imat_ilen[row] = ilen_row; 1789 } 1790 } 1791 } 1792 } 1793 1794 /* Now assemble the off proc rows*/ 1795 { 1796 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1797 PetscInt *imat_j,*imat_i; 1798 PetscScalar *imat_a,*rbuf4_i; 1799 1800 for (tmp2=0; tmp2<nrqs; tmp2++) { 1801 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1802 idex = pa[idex2]; 1803 sbuf1_i = sbuf1[idex]; 1804 jmax = sbuf1_i[0]; 1805 ct1 = 2*jmax + 1; 1806 ct2 = 0; 1807 rbuf2_i = rbuf2[idex2]; 1808 rbuf3_i = rbuf3[idex2]; 1809 rbuf4_i = rbuf4[idex2]; 1810 for (j=1; j<=jmax; j++) { 1811 is_no = sbuf1_i[2*j-1]; 1812 rmap_i = rmap[is_no]; 1813 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1814 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1815 imat_ilen = mat->ilen; 1816 imat_j = mat->j; 1817 imat_i = mat->i; 1818 imat_a = mat->a; 1819 max1 = sbuf1_i[2*j]; 1820 for (k=0; k<max1; k++,ct1++) { 1821 row = sbuf1_i[ct1]; 1822 #if defined(PETSC_USE_CTABLE) 1823 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1824 row--; 1825 #else 1826 row = rmap_i[row]; 1827 #endif 1828 ilen = imat_ilen[row]; 1829 mat_i = imat_i[row]; 1830 mat_a = imat_a + mat_i; 1831 mat_j = imat_j + mat_i; 1832 max2 = rbuf2_i[ct1]; 1833 if (!allcolumns[is_no]) { 1834 for (l=0; l<max2; l++,ct2++) { 1835 1836 #if defined(PETSC_USE_CTABLE) 1837 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1838 #else 1839 tcol = cmap_i[rbuf3_i[ct2]]; 1840 #endif 1841 if (tcol) { 1842 *mat_j++ = tcol - 1; 1843 *mat_a++ = rbuf4_i[ct2]; 1844 ilen++; 1845 } 1846 } 1847 } else { /* allcolumns */ 1848 for (l=0; l<max2; l++,ct2++) { 1849 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1850 *mat_a++ = rbuf4_i[ct2]; 1851 ilen++; 1852 } 1853 } 1854 imat_ilen[row] = ilen; 1855 } 1856 } 1857 } 1858 } 1859 1860 /* sort the rows */ 1861 { 1862 PetscInt *imat_ilen,*imat_j,*imat_i; 1863 PetscScalar *imat_a; 1864 1865 for (i=0; i<ismax; i++) { 1866 mat = (Mat_SeqAIJ*)submats[i]->data; 1867 imat_j = mat->j; 1868 imat_i = mat->i; 1869 imat_a = mat->a; 1870 imat_ilen = mat->ilen; 1871 1872 if (allcolumns[i]) continue; 1873 jmax = nrow[i]; 1874 for (j=0; j<jmax; j++) { 1875 PetscInt ilen; 1876 1877 mat_i = imat_i[j]; 1878 mat_a = imat_a + mat_i; 1879 mat_j = imat_j + mat_i; 1880 ilen = imat_ilen[j]; 1881 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 1882 } 1883 } 1884 } 1885 1886 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1887 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1888 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1889 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1890 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1891 1892 /* Restore the indices */ 1893 for (i=0; i<ismax; i++) { 1894 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1895 if (!allcolumns[i]) { 1896 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1897 } 1898 } 1899 1900 /* Destroy allocated memory */ 1901 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1902 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1903 ierr = PetscFree(pa);CHKERRQ(ierr); 1904 
1905 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1906 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1907 for (i=0; i<nrqr; ++i) { 1908 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1909 } 1910 for (i=0; i<nrqs; ++i) { 1911 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1912 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1913 } 1914 1915 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1916 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1917 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1918 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1919 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1920 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1921 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1922 1923 #if defined(PETSC_USE_CTABLE) 1924 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1925 #else 1926 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1927 #endif 1928 ierr = PetscFree(rmap);CHKERRQ(ierr); 1929 1930 for (i=0; i<ismax; i++) { 1931 if (!allcolumns[i]) { 1932 #if defined(PETSC_USE_CTABLE) 1933 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1934 #else 1935 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1936 #endif 1937 } 1938 } 1939 ierr = PetscFree(cmap);CHKERRQ(ierr); 1940 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1941 ierr = PetscFree(lens);CHKERRQ(ierr); 1942 1943 for (i=0; i<ismax; i++) { 1944 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1945 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1946 } 1947 PetscFunctionReturn(0); 1948 } 1949 1950 /* 1951 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 1952 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 1953 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 1954 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 1955 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 1956 state, and needs to be "assembled" later by compressing B's column space. 1957 1958 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 1959 Following this call, C->A & C->B have been created, even if empty. 1960 */ 1961 #undef __FUNCT__ 1962 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 1963 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 1964 { 1965 /* If making this function public, change the error returned in this function away from _PLIB. */ 1966 PetscErrorCode ierr; 1967 Mat_MPIAIJ *aij; 1968 Mat_SeqAIJ *Baij; 1969 PetscBool seqaij,Bdisassembled; 1970 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 1971 PetscScalar v; 1972 const PetscInt *rowindices,*colindices; 1973 1974 PetscFunctionBegin; 1975 /* Check to make sure the component matrices (and embeddings) are compatible with C. 
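   In particular (restating the header comment above and the size checks coded below):
   rowemb, if provided, must have local size equal to the local row count of A and of B;
   dcolemb must have local size A->cmap->n and map into [0,n); ocolemb must have local
   size B->cmap->n and map into [0,N-n), where n is C's local column size and N its
   global column size.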
  */
  if (A) {
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
    } else {
      if (C->rmap->n != A->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (dcolemb) {
      ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
      if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
    } else {
      if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }
  if (B) {
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
    if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
    if (rowemb) {
      ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
      if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
    } else {
      if (C->rmap->n != B->rmap->n) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
      }
    }
    if (ocolemb) {
      ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
      if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
    } else {
      if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
    }
  }

  aij = (Mat_MPIAIJ*)(C->data);
  if (!aij->A) {
    /* Mimic parts of MatMPIAIJSetPreallocation() */
    ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr);
    ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr);
  }
  if (A) {
    ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr);
  } else {
    ierr = MatSetUp(aij->A);CHKERRQ(ierr);
  }
  if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
    /*
      If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
      need to "disassemble" B -- convert it to using C's global indices.
      To insert the values we take the safer, albeit more expensive, route of MatSetValues().

      If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
      we call MatZeroEntries() on the existing aij->B first, so there may be a bunch of zeros
      that, perhaps, could be compacted out.

      TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
      At least avoid calling MatSetValues() and the implied searches?
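
      Worked illustration of the "disassembly" shift applied below when a fresh aij->B is
      being filled (hypothetical numbers, not taken from the code): with rstart = 10 and
      rend = 20 the shift is rend-rstart = 10, so an off-diagonal column index 12 from B's
      compact local numbering (which omits the local diagonal columns) becomes global
      column 12 + 10 = 22 of C, while an index 7 (< rstart) is left unchanged.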
2038 */ 2039 2040 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2041 #if defined(PETSC_USE_CTABLE) 2042 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2043 #else 2044 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2045 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2046 if (aij->B) { 2047 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2048 } 2049 #endif 2050 ngcol = 0; 2051 if (aij->lvec) { 2052 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2053 } 2054 if (aij->garray) { 2055 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2056 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2057 } 2058 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2059 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2060 } 2061 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2062 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2063 } 2064 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2065 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2066 } 2067 } 2068 Bdisassembled = PETSC_FALSE; 2069 if (!aij->B) { 2070 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2071 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2072 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2073 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2074 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2075 Bdisassembled = PETSC_TRUE; 2076 } 2077 if (B) { 2078 Baij = (Mat_SeqAIJ*)(B->data); 2079 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2080 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2081 for (i=0; i<B->rmap->n; i++) { 2082 nz[i] = Baij->i[i+1] - Baij->i[i]; 2083 } 2084 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2085 ierr = PetscFree(nz);CHKERRQ(ierr); 2086 } 2087 2088 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2089 shift = rend-rstart; 2090 count = 0; 2091 rowindices = NULL; 2092 colindices = NULL; 2093 if (rowemb) { 2094 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 2095 } 2096 if (ocolemb) { 2097 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 2098 } 2099 for (i=0; i<B->rmap->n; i++) { 2100 PetscInt row; 2101 row = i; 2102 if (rowindices) row = rowindices[i]; 2103 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 2104 col = Baij->j[count]; 2105 if (colindices) col = colindices[col]; 2106 if (Bdisassembled && col>=rstart) col += shift; 2107 v = Baij->a[count]; 2108 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 2109 ++count; 2110 } 2111 } 2112 /* No assembly for aij->B is necessary. */ 2113 /* FIXME: set aij->B's nonzerostate correctly. */ 2114 } else { 2115 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 2116 } 2117 C->preallocated = PETSC_TRUE; 2118 C->was_assembled = PETSC_FALSE; 2119 C->assembled = PETSC_FALSE; 2120 /* 2121 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2122 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2123 */ 2124 PetscFunctionReturn(0); 2125 } 2126 2127 #undef __FUNCT__ 2128 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 2129 /* 2130 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
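   For example (illustrative values only): with garray = {3, 17, 42}, a column index 2
   stored in B refers to global column garray[2] = 42 of C.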
2131 */ 2132 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 2133 { 2134 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 2135 2136 PetscFunctionBegin; 2137 PetscValidPointer(A,2); 2138 PetscValidPointer(B,3); 2139 /* FIXME: make sure C is assembled */ 2140 *A = aij->A; 2141 *B = aij->B; 2142 /* Note that we don't incref *A and *B, so be careful! */ 2143 PetscFunctionReturn(0); 2144 } 2145 2146 /* 2147 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 2148 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 2149 */ 2150 #undef __FUNCT__ 2151 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 2152 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 2153 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 2154 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 2155 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 2156 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 2157 { 2158 PetscErrorCode ierr; 2159 PetscMPIInt isize,flag; 2160 PetscInt i,ii,cismax,ispar,ciscol_localsize; 2161 Mat *A,*B; 2162 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 2163 2164 PetscFunctionBegin; 2165 if (!ismax) PetscFunctionReturn(0); 2166 2167 for (i = 0, cismax = 0; i < ismax; ++i) { 2168 PetscMPIInt isize; 2169 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 2170 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 2171 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 2172 if (isize > 1) ++cismax; 2173 } 2174 /* 2175 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 2176 ispar counts the number of parallel ISs across C's comm. 2177 */ 2178 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2179 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 2180 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2181 PetscFunctionReturn(0); 2182 } 2183 2184 /* if (ispar) */ 2185 /* 2186 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 2187 These are used to extract the off-diag portion of the resulting parallel matrix. 2188 The row IS for the off-diag portion is the same as for the diag portion, 2189 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 2190 */ 2191 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 2192 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 2193 for (i = 0, ii = 0; i < ismax; ++i) { 2194 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2195 if (isize > 1) { 2196 /* 2197 TODO: This is the part that's ***NOT SCALABLE***. 2198 To fix this we need to extract just the indices of C's nonzero columns 2199 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 2200 part of iscol[i] -- without actually computing ciscol[ii]. This also has 2201 to be done without serializing on the IS list, so, most likely, it is best 2202 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 
2203 */ 2204 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 2205 /* Now we have to 2206 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 2207 were sorted on each rank, concatenated they might no longer be sorted; 2208 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 2209 indices in the nondecreasing order to the original index positions. 2210 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 2211 */ 2212 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 2213 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 2214 ++ii; 2215 } 2216 } 2217 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 2218 for (i = 0, ii = 0; i < ismax; ++i) { 2219 PetscInt j,issize; 2220 const PetscInt *indices; 2221 2222 /* 2223 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 2224 */ 2225 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 2226 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 2227 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 2228 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 2229 for (j = 1; j < issize; ++j) { 2230 if (indices[j] == indices[j-1]) { 2231 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 2232 } 2233 } 2234 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 2235 2236 2237 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 2238 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 2239 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 2240 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 2241 for (j = 1; j < issize; ++j) { 2242 if (indices[j-1] == indices[j]) { 2243 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 2244 } 2245 } 2246 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 2247 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2248 if (isize > 1) { 2249 cisrow[ii] = isrow[i]; 2250 ++ii; 2251 } 2252 } 2253 /* 2254 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 2255 array of sequential matrices underlying the resulting parallel matrices. 2256 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 2257 contain duplicates. 2258 2259 There are as many diag matrices as there are original index sets. There are only as many parallel 2260 and off-diag matrices, as there are parallel (comm size > 1) index sets. 2261 2262 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 2263 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 2264 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 2265 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 2266 with A[i] and B[ii] extracted from the corresponding MPI submat. 2267 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 2268 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 2269 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 2270 (retrieve composed AA[i] or BB[ii]) and reuse them here. 
AA[i] and BB[ii] are then used to permute its 2271 values into A[i] and B[ii] sitting inside the corresponding submat. 2272 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 2273 A[i], B[ii], AA[i] or BB[ii] matrices. 2274 */ 2275 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 2276 if (scall == MAT_INITIAL_MATRIX) { 2277 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 2278 /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */ 2279 } else { 2280 ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr); 2281 ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr); 2282 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */ 2283 for (i = 0, ii = 0; i < ismax; ++i) { 2284 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2285 if (isize > 1) { 2286 Mat AA,BB; 2287 ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr); 2288 if (!isrow_p[i] && !iscol_p[i]) { 2289 A[i] = AA; 2290 } else { 2291 /* TODO: extract A[i] composed on AA. */ 2292 ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 2293 } 2294 if (!isrow_p[i] && !ciscol_p[ii]) { 2295 B[ii] = BB; 2296 } else { 2297 /* TODO: extract B[ii] composed on BB. */ 2298 ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr); 2299 } 2300 ++ii; 2301 } else { 2302 if (!isrow_p[i] && !iscol_p[i]) { 2303 A[i] = (*submat)[i]; 2304 } else { 2305 /* TODO: extract A[i] composed on (*submat)[i]. */ 2306 ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 2307 } 2308 } 2309 } 2310 } 2311 /* Now obtain the sequential A and B submatrices separately. */ 2312 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr); 2313 /* I did not figure out a good way to fix it right now. 2314 * Local column size of B[i] is different from the size of ciscol[i]. 2315 * B[i]'s size is finally determined by MatAssembly, while 2316 * ciscol[i] is computed as the complement of iscol[i]. 2317 * It is better to keep only nonzero indices when computing 2318 * the complement ciscol[i]. 2319 * */ 2320 if(scall==MAT_REUSE_MATRIX){ 2321 for(i=0; i<cismax; i++){ 2322 ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr); 2323 B[i]->cmap->n = ciscol_localsize; 2324 } 2325 } 2326 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr); 2327 /* 2328 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 2329 matrices A & B have been extracted directly into the parallel matrices containing them, or 2330 simply into the sequential matrix identical with the corresponding A (if isize == 1). 2331 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 2332 to have the same sparsity pattern. 2333 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 2334 must be constructed for C. This is done by setseqmat(s). 2335 */ 2336 for (i = 0, ii = 0; i < ismax; ++i) { 2337 /* 2338 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 2339 That way we can avoid sorting and computing permutations when reusing. 
2340 To this end: 2341 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 2342 - if caching arrays to hold the ISs, make and compose a container for them so that it can 2343 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 2344 */ 2345 MatStructure pattern; 2346 if (scall == MAT_INITIAL_MATRIX) { 2347 pattern = DIFFERENT_NONZERO_PATTERN; 2348 } else { 2349 pattern = SAME_NONZERO_PATTERN; 2350 } 2351 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 2352 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 2353 if (isize > 1) { 2354 if (scall == MAT_INITIAL_MATRIX) { 2355 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 2356 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2357 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 2358 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 2359 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 2360 } 2361 /* 2362 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 2363 */ 2364 { 2365 Mat AA,BB; 2366 AA = NULL; 2367 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i]; 2368 BB = NULL; 2369 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii]; 2370 if (AA || BB) { 2371 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 2372 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2373 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2374 } 2375 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) { 2376 /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */ 2377 ierr = MatDestroy(&AA);CHKERRQ(ierr); 2378 } 2379 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) { 2380 /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */ 2381 ierr = MatDestroy(&BB);CHKERRQ(ierr); 2382 } 2383 } 2384 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 2385 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 2386 ++ii; 2387 } else { /* if (isize == 1) */ 2388 if (scall == MAT_INITIAL_MATRIX) { 2389 if (isrow_p[i] || iscol_p[i]) { 2390 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 2391 } else (*submat)[i] = A[i]; 2392 } 2393 if (isrow_p[i] || iscol_p[i]) { 2394 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 2395 /* Otherwise A is extracted straight into (*submats)[i]. */ 2396 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
         */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
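
/*
   Hedged usage sketch (not part of the library source): it shows how a caller might reach
   the MPI extraction path implemented above.  It assumes the public entry point
   MatGetSubMatricesMPI() dispatches to MatGetSubMatricesMPI_MPIAIJ() for MPIAIJ matrices;
   the index-set sizes and the variables nlocal and first below are made up for illustration.

     IS  isrow[1],iscol[1];
     Mat *submat;

     ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nlocal,first,1,&isrow[0]);CHKERRQ(ierr);
     ierr = ISCreateStride(PetscObjectComm((PetscObject)C),nlocal,first,1,&iscol[0]);CHKERRQ(ierr);
     ierr = MatGetSubMatricesMPI(C,1,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ierr = MatGetSubMatricesMPI(C,1,isrow,iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
     ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
     ierr = ISDestroy(&isrow[0]);CHKERRQ(ierr);
     ierr = ISDestroy(&iscol[0]);CHKERRQ(ierr);

   Because the index sets here live on C's (parallel) communicator, submat[0] is an MPIAIJ
   matrix on that communicator; purely sequential index sets would instead take the
   getsubmats_seq() path.
*/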