1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 #undef __FUNCT__ 23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 25 { 26 PetscErrorCode ierr; 27 PetscInt i; 28 29 PetscFunctionBegin; 30 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 31 for (i=0; i<ov; ++i) { 32 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 33 } 34 PetscFunctionReturn(0); 35 } 36 37 #undef __FUNCT__ 38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable" 39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 40 { 41 PetscErrorCode ierr; 42 PetscInt i; 43 44 PetscFunctionBegin; 45 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 46 for (i=0; i<ov; ++i) { 47 ierr = 
MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 48 } 49 PetscFunctionReturn(0); 50 } 51 52 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 56 { 57 PetscErrorCode ierr; 58 MPI_Comm comm; 59 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 60 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata,nto,nfrom; 61 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 62 const PetscInt **indices,*indices_i; 63 PetscLayout rmap; 64 PetscMPIInt rank,size,*toranks,*fromranks; 65 PetscSF sf; 66 PetscSFNode *remote; 67 68 PetscFunctionBegin; 69 /*communicator */ 70 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 71 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 72 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 73 /*row map*/ 74 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 75 /*retrieve IS data*/ 76 ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr); 77 /*get length and indices*/ 78 for (i=0,tlength=0; i<nidx; i++){ 79 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 80 tlength += length[i]; 81 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 82 } 83 /*find these rows on remote processors */ 84 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 85 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 86 nrrows = 0; 87 for (i=0; i<nidx; i++){ 88 length_i = length[i]; 89 indices_i = indices[i]; 90 for (j=0; j<length_i; j++){ 91 owner = -1; 92 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 93 /*remote processors*/ 94 if (owner != rank){ 95 tosizes_temp[owner]++; /*number of rows to owner*/ 96 rrow_ranks[nrrows] = owner; /*processor */ 97 rrow_isids[nrrows] = i; /*is id*/ 98 remoterows[nrrows++] = indices_i[j]; /* row */ 99 } 
100 } 101 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 102 } 103 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 104 /*test if we need to exchange messages 105 * generally speaking, we do not need to exchange 106 * data when overlap is 1 107 * */ 108 ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 109 /*we do not have any messages 110 * It usually corresponds to overlap 1 111 * */ 112 if (!reducednrrows){ 113 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 114 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 115 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 nto = 0; 119 /*send sizes and ranks*/ 120 for (i=0; i<size; i++){ 121 if (tosizes_temp[i]){ 122 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 123 tosizes_temp[i] = nto; /* a map from processor to index */ 124 toranks[nto++] = i; /* processor */ 125 } 126 } 127 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 128 for (i=0; i<nto; i++){ 129 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /*offsets*/ 130 tosizes[2*i+1] = toffsets[i]; /*offsets to send*/ 131 } 132 /*send information to other processors*/ 133 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 134 /*build a star forest */ 135 nrecvrows = 0; 136 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 137 ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr); 138 nrecvrows = 0; 139 for (i=0; i<nfrom; i++){ 140 for (j=0; j<fromsizes[2*i]; j++){ 141 remote[nrecvrows].rank = fromranks[i]; 142 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 143 } 144 } 145 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 146 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 147 /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/ 148 ierr = 
PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 149 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 150 /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/ 151 /*message pair <no of is, row> */ 152 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 153 for (i=0; i<nrrows; i++){ 154 owner = rrow_ranks[i]; /* processor */ 155 j = tosizes_temp[owner]; /* index */ 156 todata[toffsets[j]++] = rrow_isids[i]; 157 todata[toffsets[j]++] = remoterows[i]; 158 } 159 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 160 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 161 ierr = PetscFree(toffsets);CHKERRQ(ierr); 162 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 163 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 164 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 165 /*deal with remote data */ 166 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 167 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 168 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 169 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 170 ierr = PetscFree(fromranks);CHKERRQ(ierr); 171 nrecvrows = 0; 172 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 173 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 174 ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr); 175 nrecvrows = 0; 176 for (i=0; i<nto; i++){ 177 for (j=0; j<tosizes[2*i]; j++){ 178 remote[nrecvrows].rank = toranks[i]; 179 remote[nrecvrows++].index = tosizes[2*i+1]+j; 180 } 181 } 182 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 183 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 184 /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/ 185 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 
186 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 187 /*overlap communication and computation*/ 188 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 189 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 190 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 191 /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/ 192 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 193 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 194 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 195 ierr = PetscFree(toranks);CHKERRQ(ierr); 196 ierr = PetscFree(tosizes);CHKERRQ(ierr); 197 ierr = PetscFree(todata);CHKERRQ(ierr); 198 PetscFunctionReturn(0); 199 } 200 201 #undef __FUNCT__ 202 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable" 203 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 204 { 205 PetscInt *isz,isz_i,i,j,is_id, data_size; 206 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 207 const PetscInt *indices_i_temp; 208 PetscErrorCode ierr; 209 210 PetscFunctionBegin; 211 max_lsize = 0; 212 ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr); 213 for (i=0; i<nidx; i++){ 214 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 215 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 216 isz[i] = lsize; 217 } 218 ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr); 219 for (i=0; i<nidx; i++){ 220 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 221 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 222 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 223 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 224 } 225 /*retrieve information */ 226 for (i=0; i<nrecvs; ){ 227 is_id = recvdata[i++]; 228 data_size = recvdata[i++]; 229 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 230 isz_i = isz[is_id]; 231 for (j=0; j< data_size; j++){ 232 col = recvdata[i++]; 233 indices_i[isz_i++] = col; 234 } 235 isz[is_id] = isz_i; 236 } 237 /*remove duplicate entities*/ 238 for (i=0; i<nidx; i++){ 239 indices_i = indices_temp+(max_lsize+nrecvs)*i; 240 isz_i = isz[i]; 241 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 242 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 243 } 244 ierr = PetscFree(isz);CHKERRQ(ierr); 245 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 246 PetscFunctionReturn(0); 247 } 248 249 #undef __FUNCT__ 250 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable" 251 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 252 { 253 PetscLayout rmap,cmap; 254 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 255 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 256 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 257 const PetscInt *gcols,*ai,*aj,*bi,*bj; 258 Mat amat,bmat; 259 PetscMPIInt rank; 260 PetscBool done; 261 MPI_Comm comm; 262 PetscErrorCode ierr; 263 264 PetscFunctionBegin; 265 ierr = 
PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 266 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 267 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 268 /* Even if the mat is symmetric, we still assume it is not symmetric*/ 269 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 270 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 271 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 272 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 273 /*total number of nonzero values */ 274 tnz = ai[an]+bi[bn]; 275 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 276 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 277 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 278 /*longest message */ 279 max_fszs = 0; 280 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 281 /*better way to estimate number of nonzero in the mat???*/ 282 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 283 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 284 rows_pos = 0; 285 totalrows = 0; 286 for (i=0; i<nfrom; i++){ 287 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 288 /*group data*/ 289 for (j=0; j<fromsizes[2*i]; j+=2){ 290 is_id = fromrows[rows_pos++];/*no of is*/ 291 rows_i = rows_data[is_id]; 292 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 293 } 294 /*estimate a space to avoid multiple allocations */ 295 for (j=0; j<nidx; j++){ 296 indvc_ij = 0; 297 rows_i = rows_data[j]; 298 for (l=0; l<rows_pos_i[j]; l++){ 299 row = rows_i[l]-rstart; 300 start = ai[row]; 301 end = ai[row+1]; 302 for (k=start; k<end; k++){ /* Amat */ 303 col = aj[k] + cstart; 304 indices_tmp[indvc_ij++] = col;/*do not count the rows from the 
original rank*/ 305 } 306 start = bi[row]; 307 end = bi[row+1]; 308 for (k=start; k<end; k++) { /* Bmat */ 309 col = gcols[bj[k]]; 310 indices_tmp[indvc_ij++] = col; 311 } 312 } 313 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 314 indv_counts[i*nidx+j] = indvc_ij; 315 totalrows += indvc_ij; 316 } 317 } 318 /*message triple <no of is, number of rows, rows> */ 319 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 320 totalrows = 0; 321 rows_pos = 0; 322 /* use this code again */ 323 for (i=0;i<nfrom;i++){ 324 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 325 for (j=0; j<fromsizes[2*i]; j+=2){ 326 is_id = fromrows[rows_pos++]; 327 rows_i = rows_data[is_id]; 328 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 329 } 330 /* add data */ 331 for (j=0; j<nidx; j++){ 332 if (!indv_counts[i*nidx+j]) continue; 333 indvc_ij = 0; 334 sbdata[totalrows++] = j; 335 sbdata[totalrows++] = indv_counts[i*nidx+j]; 336 sbsizes[2*i] += 2; 337 rows_i = rows_data[j]; 338 for (l=0; l<rows_pos_i[j]; l++){ 339 row = rows_i[l]-rstart; 340 start = ai[row]; 341 end = ai[row+1]; 342 for (k=start; k<end; k++){ /* Amat */ 343 col = aj[k] + cstart; 344 indices_tmp[indvc_ij++] = col; 345 } 346 start = bi[row]; 347 end = bi[row+1]; 348 for (k=start; k<end; k++) { /* Bmat */ 349 col = gcols[bj[k]]; 350 indices_tmp[indvc_ij++] = col; 351 } 352 } 353 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 354 sbsizes[2*i] += indvc_ij; 355 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 356 totalrows += indvc_ij; 357 } 358 } 359 /* offsets */ 360 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 361 for (i=0; i<nfrom; i++){ 362 offsets[i+1] = offsets[i] + sbsizes[2*i]; 363 sbsizes[2*i+1] = offsets[i]; 364 } 365 ierr = PetscFree(offsets);CHKERRQ(ierr); 366 if (sbrowsizes) *sbrowsizes = sbsizes; 367 if (sbrows) *sbrows = sbdata; 368 ierr = 
PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 369 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 370 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 371 PetscFunctionReturn(0); 372 } 373 374 #undef __FUNCT__ 375 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 376 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 377 { 378 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 379 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 380 PetscInt lsize,lsize_tmp,owner; 381 PetscMPIInt rank; 382 Mat amat,bmat; 383 PetscBool done; 384 PetscLayout cmap,rmap; 385 MPI_Comm comm; 386 PetscErrorCode ierr; 387 388 PetscFunctionBegin; 389 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 390 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 391 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 392 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 393 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 394 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 395 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 396 /*is it a safe way to compute number of nonzero values ?*/ 397 tnz = ai[an]+bi[bn]; 398 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 399 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 400 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 401 /*it is a better way to estimate memory than the old implementation 402 * where global size of matrix is used 403 * */ 404 ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr); 405 for (i=0; i<nidx; i++) { 406 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 407 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 408 
lsize_tmp = 0; 409 for (j=0; j<lsize; j++) { 410 owner = -1; 411 row = indices[j]; 412 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 413 if (owner != rank) continue; 414 /*local number*/ 415 row -= rstart; 416 start = ai[row]; 417 end = ai[row+1]; 418 for (k=start; k<end; k++) { /* Amat */ 419 col = aj[k] + cstart; 420 indices_temp[lsize_tmp++] = col; 421 } 422 start = bi[row]; 423 end = bi[row+1]; 424 for (k=start; k<end; k++) { /* Bmat */ 425 col = gcols[bj[k]]; 426 indices_temp[lsize_tmp++] = col; 427 } 428 } 429 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 430 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 431 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 432 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 433 } 434 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 435 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 436 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 437 PetscFunctionReturn(0); 438 } 439 440 441 /* 442 Sample message format: 443 If a processor A wants processor B to process some elements corresponding 444 to index sets is[1],is[5] 445 mesg [0] = 2 (no of index sets in the mesg) 446 ----------- 447 mesg [1] = 1 => is[1] 448 mesg [2] = sizeof(is[1]); 449 ----------- 450 mesg [3] = 5 => is[5] 451 mesg [4] = sizeof(is[5]); 452 ----------- 453 mesg [5] 454 mesg [n] datas[1] 455 ----------- 456 mesg[n+1] 457 mesg[m] data(is[5]) 458 ----------- 459 460 Notes: 461 nrqs - no of requests sent (or to be sent out) 462 nrqr - no of requests recieved (which have to be or which have been processed 463 */ 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 466 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 467 { 468 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 469 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 470 
const PetscInt **idx,*idx_i; 471 PetscInt *n,**data,len; 472 PetscErrorCode ierr; 473 PetscMPIInt size,rank,tag1,tag2; 474 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 475 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 476 PetscBT *table; 477 MPI_Comm comm; 478 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 479 MPI_Status *s_status,*recv_status; 480 char *t_p; 481 482 PetscFunctionBegin; 483 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 484 size = c->size; 485 rank = c->rank; 486 M = C->rmap->N; 487 488 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 489 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 490 491 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 492 493 for (i=0; i<imax; i++) { 494 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 495 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 496 } 497 498 /* evaluate communication - mesg to who,length of mesg, and buffer space 499 required. Based on this, buffers are allocated, and data copied into them*/ 500 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 501 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 502 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 503 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 504 for (i=0; i<imax; i++) { 505 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 506 idx_i = idx[i]; 507 len = n[i]; 508 for (j=0; j<len; j++) { 509 row = idx_i[j]; 510 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 511 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 512 w4[proc]++; 513 } 514 for (j=0; j<size; j++) { 515 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 516 } 517 } 518 519 nrqs = 0; /* no of outgoing messages */ 520 msz = 0; /* total mesg length 
(for all proc */ 521 w1[rank] = 0; /* no mesg sent to intself */ 522 w3[rank] = 0; 523 for (i=0; i<size; i++) { 524 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 525 } 526 /* pa - is list of processors to communicate with */ 527 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 528 for (i=0,j=0; i<size; i++) { 529 if (w1[i]) {pa[j] = i; j++;} 530 } 531 532 /* Each message would have a header = 1 + 2*(no of IS) + data */ 533 for (i=0; i<nrqs; i++) { 534 j = pa[i]; 535 w1[j] += w2[j] + 2*w3[j]; 536 msz += w1[j]; 537 } 538 539 /* Determine the number of messages to expect, their lengths, from from-ids */ 540 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 541 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 542 543 /* Now post the Irecvs corresponding to these messages */ 544 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 545 546 /* Allocate Memory for outgoing messages */ 547 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 548 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 549 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 550 551 { 552 PetscInt *iptr = tmp,ict = 0; 553 for (i=0; i<nrqs; i++) { 554 j = pa[i]; 555 iptr += ict; 556 outdat[j] = iptr; 557 ict = w1[j]; 558 } 559 } 560 561 /* Form the outgoing messages */ 562 /*plug in the headers*/ 563 for (i=0; i<nrqs; i++) { 564 j = pa[i]; 565 outdat[j][0] = 0; 566 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 567 ptr[j] = outdat[j] + 2*w3[j] + 1; 568 } 569 570 /* Memory for doing local proc's work*/ 571 { 572 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 573 574 for (i=0; i<imax; i++) { 575 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 576 data[i] = d_p + M*i; 577 } 578 } 579 580 /* Parse the IS and update local tables and the 
outgoing buf with the data*/ 581 { 582 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 583 PetscBT table_i; 584 585 for (i=0; i<imax; i++) { 586 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 587 n_i = n[i]; 588 table_i = table[i]; 589 idx_i = idx[i]; 590 data_i = data[i]; 591 isz_i = isz[i]; 592 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 593 row = idx_i[j]; 594 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 595 if (proc != rank) { /* copy to the outgoing buffer */ 596 ctr[proc]++; 597 *ptr[proc] = row; 598 ptr[proc]++; 599 } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */ 600 } 601 /* Update the headers for the current IS */ 602 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 603 if ((ctr_j = ctr[j])) { 604 outdat_j = outdat[j]; 605 k = ++outdat_j[0]; 606 outdat_j[2*k] = ctr_j; 607 outdat_j[2*k-1] = i; 608 } 609 } 610 isz[i] = isz_i; 611 } 612 } 613 614 /* Now post the sends */ 615 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 616 for (i=0; i<nrqs; ++i) { 617 j = pa[i]; 618 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 619 } 620 621 /* No longer need the original indices*/ 622 for (i=0; i<imax; ++i) { 623 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 624 } 625 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 626 627 for (i=0; i<imax; ++i) { 628 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 629 } 630 631 /* Do Local work*/ 632 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 633 634 /* Receive messages*/ 635 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 636 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 637 638 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 639 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 640 641 /* Phase 1 sends are complete - deallocate buffers */ 642 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 643 ierr = 
PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 644 645 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 646 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 647 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 648 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 649 ierr = PetscFree(rbuf);CHKERRQ(ierr); 650 651 652 /* Send the data back*/ 653 /* Do a global reduction to know the buffer space req for incoming messages*/ 654 { 655 PetscMPIInt *rw1; 656 657 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 658 659 for (i=0; i<nrqr; ++i) { 660 proc = recv_status[i].MPI_SOURCE; 661 662 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 663 rw1[proc] = isz1[i]; 664 } 665 ierr = PetscFree(onodes1);CHKERRQ(ierr); 666 ierr = PetscFree(olengths1);CHKERRQ(ierr); 667 668 /* Determine the number of messages to expect, their lengths, from from-ids */ 669 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 670 ierr = PetscFree(rw1);CHKERRQ(ierr); 671 } 672 /* Now post the Irecvs corresponding to these messages */ 673 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 674 675 /* Now post the sends */ 676 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 677 for (i=0; i<nrqr; ++i) { 678 j = recv_status[i].MPI_SOURCE; 679 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 680 } 681 682 /* receive work done on other processors*/ 683 { 684 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 685 PetscMPIInt idex; 686 PetscBT table_i; 687 MPI_Status *status2; 688 689 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 690 for (i=0; i<nrqs; ++i) { 691 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 692 /* Process the message*/ 693 rbuf2_i = rbuf2[idex]; 694 ct1 = 2*rbuf2_i[0]+1; 695 jmax = rbuf2[idex][0]; 696 for (j=1; j<=jmax; j++) { 697 max = rbuf2_i[2*j]; 698 is_no = rbuf2_i[2*j-1]; 699 isz_i = 
isz[is_no]; 700 data_i = data[is_no]; 701 table_i = table[is_no]; 702 for (k=0; k<max; k++,ct1++) { 703 row = rbuf2_i[ct1]; 704 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 705 } 706 isz[is_no] = isz_i; 707 } 708 } 709 710 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 711 ierr = PetscFree(status2);CHKERRQ(ierr); 712 } 713 714 for (i=0; i<imax; ++i) { 715 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 716 } 717 718 ierr = PetscFree(onodes2);CHKERRQ(ierr); 719 ierr = PetscFree(olengths2);CHKERRQ(ierr); 720 721 ierr = PetscFree(pa);CHKERRQ(ierr); 722 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 723 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 724 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 725 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 726 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 727 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 728 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 729 ierr = PetscFree(s_status);CHKERRQ(ierr); 730 ierr = PetscFree(recv_status);CHKERRQ(ierr); 731 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 732 ierr = PetscFree(xdata);CHKERRQ(ierr); 733 ierr = PetscFree(isz1);CHKERRQ(ierr); 734 PetscFunctionReturn(0); 735 } 736 737 #undef __FUNCT__ 738 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 739 /* 740 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 741 the work on the local processor. 742 743 Inputs: 744 C - MAT_MPIAIJ; 745 imax - total no of index sets processed at a time; 746 table - an array of char - size = m bits. 
747 748 Output: 749 isz - array containing the count of the solution elements corresponding 750 to each index set; 751 data - pointer to the solutions 752 */ 753 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 754 { 755 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 756 Mat A = c->A,B = c->B; 757 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 758 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 759 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 760 PetscBT table_i; 761 762 PetscFunctionBegin; 763 rstart = C->rmap->rstart; 764 cstart = C->cmap->rstart; 765 ai = a->i; 766 aj = a->j; 767 bi = b->i; 768 bj = b->j; 769 garray = c->garray; 770 771 772 for (i=0; i<imax; i++) { 773 data_i = data[i]; 774 table_i = table[i]; 775 isz_i = isz[i]; 776 for (j=0,max=isz[i]; j<max; j++) { 777 row = data_i[j] - rstart; 778 start = ai[row]; 779 end = ai[row+1]; 780 for (k=start; k<end; k++) { /* Amat */ 781 val = aj[k] + cstart; 782 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 783 } 784 start = bi[row]; 785 end = bi[row+1]; 786 for (k=start; k<end; k++) { /* Bmat */ 787 val = garray[bj[k]]; 788 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 789 } 790 } 791 isz[i] = isz_i; 792 } 793 PetscFunctionReturn(0); 794 } 795 796 #undef __FUNCT__ 797 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 798 /* 799 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 800 and return the output 801 802 Input: 803 C - the matrix 804 nrqr - no of messages being processed. 805 rbuf - an array of pointers to the recieved requests 806 807 Output: 808 xdata - array of messages to be sent back 809 isz1 - size of each message 810 811 For better efficiency perhaps we should malloc separately each xdata[i], 812 then if a remalloc is required we need only copy the data for that one row 813 rather then all previous rows as it is now where a single large chunck of 814 memory is used. 
*/
/*
   MatIncreaseOverlap_MPIAIJ_Receive - services the nrqr index requests received in rbuf[]:
   every requested row is expanded with the column indices of its diagonal (A) and
   off-diagonal (B) parts, duplicates are removed with a bit table, and the replies are
   packed into xdata[]; isz1[i] records the size of the reply to requester i.

   NOTE(review): xdata[0] owns the single contiguous allocation; xdata[i>0] are interior
   pointers into it, which is why they are re-anchored after every realloc below.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count total requested rows (ct) and total request payload (total_sz) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic reply-size estimate: average nonzeros per local row times requested rows */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first pass: record the requested rows themselves, deduplicated via xtable */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single reply buffer by 1.5x and re-anchor per-proc pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* second pass: append every column of each recorded row; A-part indices are
         shifted by cstart, B-part indices mapped to global numbering through garray */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2;    /* number of indices returned for this IS */
      xdata[i][2*j-1] = rbuf_i[2*j-1]; /* echo back the IS id */
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - gathers the entire parallel matrix A onto every process
   as a single sequential matrix returned in *(*Bin).  flag selects whether numerical
   values are copied (MAT_GET_VALUES) or only the structure; scall selects initial
   creation vs reuse of a previously returned matrix.  Collective on A's communicator.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* fill only the locally owned rows; the Allgatherv below completes the rest */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* my slot inside B's global j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {  /* MAT_REUSE_MATRIX: structure already gathered on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];  /* my slot inside B's global a array */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the column-index copy above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {  /* not allocated above: we are in the MAT_REUSE_MATRIX path */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from A to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - extracts ismax sequential submatrices of the parallel
   matrix C, one per (isrow[i],iscol[i]) pair.  Detects the special case where a
   single rank-identical identity request means "every process wants the whole
   matrix" and dispatches to MatGetSubMatrix_MPIAIJ_All; otherwise the work is done
   in stages by MatGetSubMatrices_MPIAIJ_Local to bound memory use.  Collective on C.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* all ranks must agree (MPI_MIN) before taking the fast all-matrix path */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall !=
MAT_REUSE_MATRIX) { 1158 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 1159 } 1160 1161 /* Check for special case: each processor gets entire matrix columns */ 1162 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 1163 for (i=0; i<ismax; i++) { 1164 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 1165 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 1166 if (colflag && ncol == C->cmap->N) { 1167 allcolumns[i] = PETSC_TRUE; 1168 } else { 1169 allcolumns[i] = PETSC_FALSE; 1170 } 1171 } 1172 1173 /* Determine the number of stages through which submatrices are done */ 1174 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1175 1176 /* 1177 Each stage will extract nmax submatrices. 1178 nmax is determined by the matrix column dimension. 1179 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1180 */ 1181 if (!nmax) nmax = 1; 1182 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 1183 1184 /* Make sure every processor loops through the nstages */ 1185 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 1186 1187 for (i=0,pos=0; i<nstages; i++) { 1188 if (pos+nmax <= ismax) max_no = nmax; 1189 else if (pos == ismax) max_no = 0; 1190 else max_no = ismax-pos; 1191 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 1192 pos += max_no; 1193 } 1194 1195 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 1196 PetscFunctionReturn(0); 1197 } 1198 1199 /* -------------------------------------------------------------------------*/ 1200 #undef __FUNCT__ 1201 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 1202 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 1203 { 1204 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1205 Mat A = c->A; 1206 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 
1207 const PetscInt **icol,**irow; 1208 PetscInt *nrow,*ncol,start; 1209 PetscErrorCode ierr; 1210 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 1211 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 1212 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 1213 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 1214 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 1215 #if defined(PETSC_USE_CTABLE) 1216 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 1217 #else 1218 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 1219 #endif 1220 const PetscInt *irow_i; 1221 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 1222 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1223 MPI_Request *r_waits4,*s_waits3,*s_waits4; 1224 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 1225 MPI_Status *r_status3,*r_status4,*s_status4; 1226 MPI_Comm comm; 1227 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 1228 PetscMPIInt *onodes1,*olengths1; 1229 PetscMPIInt idex,idex2,end; 1230 1231 PetscFunctionBegin; 1232 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1233 tag0 = ((PetscObject)C)->tag; 1234 size = c->size; 1235 rank = c->rank; 1236 1237 /* Get some new tags to keep the communication clean */ 1238 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 1239 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1240 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1241 1242 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 1243 1244 for (i=0; i<ismax; i++) { 1245 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 1246 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 1247 if (allcolumns[i]) { 1248 icol[i] = NULL; 1249 ncol[i] = C->cmap->N; 1250 } else { 1251 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 1252 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 1253 } 1254 } 1255 1256 /* evaluate 
communication - mesg to who, length of mesg, and buffer space 1257 required. Based on this, buffers are allocated, and data copied into them*/ 1258 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 1259 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1260 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1261 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1262 for (i=0; i<ismax; i++) { 1263 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1264 jmax = nrow[i]; 1265 irow_i = irow[i]; 1266 for (j=0; j<jmax; j++) { 1267 l = 0; 1268 row = irow_i[j]; 1269 while (row >= C->rmap->range[l+1]) l++; 1270 proc = l; 1271 w4[proc]++; 1272 } 1273 for (j=0; j<size; j++) { 1274 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 1275 } 1276 } 1277 1278 nrqs = 0; /* no of outgoing messages */ 1279 msz = 0; /* total mesg length (for all procs) */ 1280 w1[rank] = 0; /* no mesg sent to self */ 1281 w3[rank] = 0; 1282 for (i=0; i<size; i++) { 1283 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 1284 } 1285 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1286 for (i=0,j=0; i<size; i++) { 1287 if (w1[i]) { pa[j] = i; j++; } 1288 } 1289 1290 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1291 for (i=0; i<nrqs; i++) { 1292 j = pa[i]; 1293 w1[j] += w2[j] + 2* w3[j]; 1294 msz += w1[j]; 1295 } 1296 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1297 1298 /* Determine the number of messages to expect, their lengths, from from-ids */ 1299 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1300 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1301 1302 /* Now post the Irecvs corresponding to these messages */ 1303 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1304 1305 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1306 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1307 1308 /* Allocate Memory for outgoing messages */ 1309 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1310 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1311 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1312 1313 { 1314 PetscInt *iptr = tmp,ict = 0; 1315 for (i=0; i<nrqs; i++) { 1316 j = pa[i]; 1317 iptr += ict; 1318 sbuf1[j] = iptr; 1319 ict = w1[j]; 1320 } 1321 } 1322 1323 /* Form the outgoing messages */ 1324 /* Initialize the header space */ 1325 for (i=0; i<nrqs; i++) { 1326 j = pa[i]; 1327 sbuf1[j][0] = 0; 1328 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 1329 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 1330 } 1331 1332 /* Parse the isrow and copy data into outbuf */ 1333 for (i=0; i<ismax; i++) { 1334 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1335 irow_i = irow[i]; 1336 jmax = nrow[i]; 1337 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 1338 l = 0; 1339 row = irow_i[j]; 1340 while (row >= C->rmap->range[l+1]) l++; 1341 proc = l; 1342 if (proc != rank) { /* copy to the outgoing buf*/ 1343 ctr[proc]++; 1344 *ptr[proc] = row; 1345 ptr[proc]++; 1346 } 1347 } 1348 /* Update the headers for the current IS */ 1349 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1350 if ((ctr_j = ctr[j])) { 1351 sbuf1_j = sbuf1[j]; 1352 k = ++sbuf1_j[0]; 1353 sbuf1_j[2*k] = ctr_j; 1354 sbuf1_j[2*k-1] = i; 1355 } 1356 } 1357 } 1358 1359 /* Now post the sends */ 1360 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1361 for (i=0; i<nrqs; ++i) { 1362 j = pa[i]; 1363 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 1364 } 1365 1366 /* Post Receives to capture the buffer size */ 1367 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 1368 
ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 1369 rbuf2[0] = tmp + msz; 1370 for (i=1; i<nrqs; ++i) { 1371 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 1372 } 1373 for (i=0; i<nrqs; ++i) { 1374 j = pa[i]; 1375 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 1376 } 1377 1378 /* Send to other procs the buf size they should allocate */ 1379 1380 1381 /* Receive messages*/ 1382 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 1383 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1384 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 1385 { 1386 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 1387 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 1388 PetscInt *sbuf2_i; 1389 1390 for (i=0; i<nrqr; ++i) { 1391 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1392 1393 req_size[idex] = 0; 1394 rbuf1_i = rbuf1[idex]; 1395 start = 2*rbuf1_i[0] + 1; 1396 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1397 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 1398 sbuf2_i = sbuf2[idex]; 1399 for (j=start; j<end; j++) { 1400 id = rbuf1_i[j] - rstart; 1401 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1402 sbuf2_i[j] = ncols; 1403 req_size[idex] += ncols; 1404 } 1405 req_source[idex] = r_status1[i].MPI_SOURCE; 1406 /* form the header */ 1407 sbuf2_i[0] = req_size[idex]; 1408 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1409 1410 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1411 } 1412 } 1413 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1414 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1415 1416 /* recv buffer sizes */ 1417 /* Receive messages*/ 1418 1419 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 1420 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1421 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 1422 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1423 ierr = 
PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 1424 1425 for (i=0; i<nrqs; ++i) { 1426 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1427 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 1428 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 1429 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1430 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1431 } 1432 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1433 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1434 1435 /* Wait on sends1 and sends2 */ 1436 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1437 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 1438 1439 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1440 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1441 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1442 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1443 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1444 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1445 1446 /* Now allocate buffers for a->j, and send them off */ 1447 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1448 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1449 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1450 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1451 1452 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 1453 { 1454 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1455 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1456 PetscInt cend = C->cmap->rend; 1457 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1458 1459 for (i=0; i<nrqr; i++) { 1460 rbuf1_i = rbuf1[i]; 1461 sbuf_aj_i = sbuf_aj[i]; 1462 ct1 = 2*rbuf1_i[0] + 1; 1463 ct2 = 0; 1464 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1465 kmax = rbuf1[i][2*j]; 1466 for (k=0; k<kmax; k++,ct1++) { 
1467 row = rbuf1_i[ct1] - rstart; 1468 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1469 ncols = nzA + nzB; 1470 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1471 1472 /* load the column indices for this row into cols*/ 1473 cols = sbuf_aj_i + ct2; 1474 1475 lwrite = 0; 1476 for (l=0; l<nzB; l++) { 1477 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1478 } 1479 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1480 for (l=0; l<nzB; l++) { 1481 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1482 } 1483 1484 ct2 += ncols; 1485 } 1486 } 1487 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1488 } 1489 } 1490 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 1491 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 1492 1493 /* Allocate buffers for a->a, and send them off */ 1494 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1495 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1496 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1497 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1498 1499 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1500 { 1501 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1502 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1503 PetscInt cend = C->cmap->rend; 1504 PetscInt *b_j = b->j; 1505 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1506 1507 for (i=0; i<nrqr; i++) { 1508 rbuf1_i = rbuf1[i]; 1509 sbuf_aa_i = sbuf_aa[i]; 1510 ct1 = 2*rbuf1_i[0]+1; 1511 ct2 = 0; 1512 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1513 kmax = rbuf1_i[2*j]; 1514 for (k=0; k<kmax; k++,ct1++) { 1515 row = rbuf1_i[ct1] - rstart; 1516 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1517 ncols = nzA + nzB; 1518 cworkB = b_j + b_i[row]; 1519 vworkA = a_a + a_i[row]; 1520 vworkB = b_a + b_i[row]; 1521 1522 /* load the column values for this row into vals*/ 1523 vals = 
sbuf_aa_i+ct2; 1524 1525 lwrite = 0; 1526 for (l=0; l<nzB; l++) { 1527 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1528 } 1529 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1530 for (l=0; l<nzB; l++) { 1531 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1532 } 1533 1534 ct2 += ncols; 1535 } 1536 } 1537 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1538 } 1539 } 1540 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1541 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1542 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1543 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1544 1545 /* Form the matrix */ 1546 /* create col map: global col of C -> local col of submatrices */ 1547 { 1548 const PetscInt *icol_i; 1549 #if defined(PETSC_USE_CTABLE) 1550 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1551 for (i=0; i<ismax; i++) { 1552 if (!allcolumns[i]) { 1553 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1554 1555 jmax = ncol[i]; 1556 icol_i = icol[i]; 1557 cmap_i = cmap[i]; 1558 for (j=0; j<jmax; j++) { 1559 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1560 } 1561 } else { 1562 cmap[i] = NULL; 1563 } 1564 } 1565 #else 1566 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1567 for (i=0; i<ismax; i++) { 1568 if (!allcolumns[i]) { 1569 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1570 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1571 jmax = ncol[i]; 1572 icol_i = icol[i]; 1573 cmap_i = cmap[i]; 1574 for (j=0; j<jmax; j++) { 1575 cmap_i[icol_i[j]] = j+1; 1576 } 1577 } else { 1578 cmap[i] = NULL; 1579 } 1580 } 1581 #endif 1582 } 1583 1584 /* Create lens which is required for MatCreate... 
*/ 1585 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1586 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1587 if (ismax) { 1588 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1589 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1590 } 1591 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1592 1593 /* Update lens from local data */ 1594 for (i=0; i<ismax; i++) { 1595 jmax = nrow[i]; 1596 if (!allcolumns[i]) cmap_i = cmap[i]; 1597 irow_i = irow[i]; 1598 lens_i = lens[i]; 1599 for (j=0; j<jmax; j++) { 1600 l = 0; 1601 row = irow_i[j]; 1602 while (row >= C->rmap->range[l+1]) l++; 1603 proc = l; 1604 if (proc == rank) { 1605 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1606 if (!allcolumns[i]) { 1607 for (k=0; k<ncols; k++) { 1608 #if defined(PETSC_USE_CTABLE) 1609 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1610 #else 1611 tcol = cmap_i[cols[k]]; 1612 #endif 1613 if (tcol) lens_i[j]++; 1614 } 1615 } else { /* allcolumns */ 1616 lens_i[j] = ncols; 1617 } 1618 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1619 } 1620 } 1621 } 1622 1623 /* Create row map: global row of C -> local row of submatrices */ 1624 #if defined(PETSC_USE_CTABLE) 1625 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1626 for (i=0; i<ismax; i++) { 1627 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1628 rmap_i = rmap[i]; 1629 irow_i = irow[i]; 1630 jmax = nrow[i]; 1631 for (j=0; j<jmax; j++) { 1632 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1633 } 1634 } 1635 #else 1636 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1637 if (ismax) { 1638 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1639 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1640 } 1641 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1642 for (i=0; i<ismax; i++) { 1643 rmap_i = rmap[i]; 1644 irow_i = irow[i]; 1645 jmax = nrow[i]; 1646 for (j=0; j<jmax; 
j++) { 1647 rmap_i[irow_i[j]] = j; 1648 } 1649 } 1650 #endif 1651 1652 /* Update lens from offproc data */ 1653 { 1654 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1655 1656 for (tmp2=0; tmp2<nrqs; tmp2++) { 1657 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1658 idex = pa[idex2]; 1659 sbuf1_i = sbuf1[idex]; 1660 jmax = sbuf1_i[0]; 1661 ct1 = 2*jmax+1; 1662 ct2 = 0; 1663 rbuf2_i = rbuf2[idex2]; 1664 rbuf3_i = rbuf3[idex2]; 1665 for (j=1; j<=jmax; j++) { 1666 is_no = sbuf1_i[2*j-1]; 1667 max1 = sbuf1_i[2*j]; 1668 lens_i = lens[is_no]; 1669 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1670 rmap_i = rmap[is_no]; 1671 for (k=0; k<max1; k++,ct1++) { 1672 #if defined(PETSC_USE_CTABLE) 1673 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1674 row--; 1675 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1676 #else 1677 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1678 #endif 1679 max2 = rbuf2_i[ct1]; 1680 for (l=0; l<max2; l++,ct2++) { 1681 if (!allcolumns[is_no]) { 1682 #if defined(PETSC_USE_CTABLE) 1683 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1684 #else 1685 tcol = cmap_i[rbuf3_i[ct2]]; 1686 #endif 1687 if (tcol) lens_i[row]++; 1688 } else { /* allcolumns */ 1689 lens_i[row]++; /* lens_i[row] += max2 ? */ 1690 } 1691 } 1692 } 1693 } 1694 } 1695 } 1696 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1697 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1698 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1699 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1700 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1701 1702 /* Create the submatrices */ 1703 if (scall == MAT_REUSE_MATRIX) { 1704 PetscBool flag; 1705 1706 /* 1707 Assumes new rows are same length as the old rows,hence bug! 
1708 */ 1709 for (i=0; i<ismax; i++) { 1710 mat = (Mat_SeqAIJ*)(submats[i]->data); 1711 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1712 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1713 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1714 /* Initial matrix as if empty */ 1715 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1716 1717 submats[i]->factortype = C->factortype; 1718 } 1719 } else { 1720 for (i=0; i<ismax; i++) { 1721 PetscInt rbs,cbs; 1722 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1723 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1724 1725 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1726 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1727 1728 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1729 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1730 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1731 } 1732 } 1733 1734 /* Assemble the matrices */ 1735 /* First assemble the local rows */ 1736 { 1737 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1738 PetscScalar *imat_a; 1739 1740 for (i=0; i<ismax; i++) { 1741 mat = (Mat_SeqAIJ*)submats[i]->data; 1742 imat_ilen = mat->ilen; 1743 imat_j = mat->j; 1744 imat_i = mat->i; 1745 imat_a = mat->a; 1746 1747 if (!allcolumns[i]) cmap_i = cmap[i]; 1748 rmap_i = rmap[i]; 1749 irow_i = irow[i]; 1750 jmax = nrow[i]; 1751 for (j=0; j<jmax; j++) { 1752 l = 0; 1753 row = irow_i[j]; 1754 while (row >= C->rmap->range[l+1]) l++; 1755 proc = l; 1756 if (proc == rank) { 1757 old_row = row; 1758 #if defined(PETSC_USE_CTABLE) 1759 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1760 row--; 1761 #else 1762 row = rmap_i[row]; 1763 #endif 
1764 ilen_row = imat_ilen[row]; 1765 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1766 mat_i = imat_i[row]; 1767 mat_a = imat_a + mat_i; 1768 mat_j = imat_j + mat_i; 1769 if (!allcolumns[i]) { 1770 for (k=0; k<ncols; k++) { 1771 #if defined(PETSC_USE_CTABLE) 1772 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1773 #else 1774 tcol = cmap_i[cols[k]]; 1775 #endif 1776 if (tcol) { 1777 *mat_j++ = tcol - 1; 1778 *mat_a++ = vals[k]; 1779 ilen_row++; 1780 } 1781 } 1782 } else { /* allcolumns */ 1783 for (k=0; k<ncols; k++) { 1784 *mat_j++ = cols[k]; /* global col index! */ 1785 *mat_a++ = vals[k]; 1786 ilen_row++; 1787 } 1788 } 1789 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1790 1791 imat_ilen[row] = ilen_row; 1792 } 1793 } 1794 } 1795 } 1796 1797 /* Now assemble the off proc rows*/ 1798 { 1799 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1800 PetscInt *imat_j,*imat_i; 1801 PetscScalar *imat_a,*rbuf4_i; 1802 1803 for (tmp2=0; tmp2<nrqs; tmp2++) { 1804 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1805 idex = pa[idex2]; 1806 sbuf1_i = sbuf1[idex]; 1807 jmax = sbuf1_i[0]; 1808 ct1 = 2*jmax + 1; 1809 ct2 = 0; 1810 rbuf2_i = rbuf2[idex2]; 1811 rbuf3_i = rbuf3[idex2]; 1812 rbuf4_i = rbuf4[idex2]; 1813 for (j=1; j<=jmax; j++) { 1814 is_no = sbuf1_i[2*j-1]; 1815 rmap_i = rmap[is_no]; 1816 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1817 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1818 imat_ilen = mat->ilen; 1819 imat_j = mat->j; 1820 imat_i = mat->i; 1821 imat_a = mat->a; 1822 max1 = sbuf1_i[2*j]; 1823 for (k=0; k<max1; k++,ct1++) { 1824 row = sbuf1_i[ct1]; 1825 #if defined(PETSC_USE_CTABLE) 1826 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1827 row--; 1828 #else 1829 row = rmap_i[row]; 1830 #endif 1831 ilen = imat_ilen[row]; 1832 mat_i = imat_i[row]; 1833 mat_a = imat_a + mat_i; 1834 mat_j = imat_j + mat_i; 1835 max2 = rbuf2_i[ct1]; 1836 if (!allcolumns[is_no]) { 1837 
for (l=0; l<max2; l++,ct2++) { 1838 1839 #if defined(PETSC_USE_CTABLE) 1840 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1841 #else 1842 tcol = cmap_i[rbuf3_i[ct2]]; 1843 #endif 1844 if (tcol) { 1845 *mat_j++ = tcol - 1; 1846 *mat_a++ = rbuf4_i[ct2]; 1847 ilen++; 1848 } 1849 } 1850 } else { /* allcolumns */ 1851 for (l=0; l<max2; l++,ct2++) { 1852 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1853 *mat_a++ = rbuf4_i[ct2]; 1854 ilen++; 1855 } 1856 } 1857 imat_ilen[row] = ilen; 1858 } 1859 } 1860 } 1861 } 1862 1863 /* sort the rows */ 1864 { 1865 PetscInt *imat_ilen,*imat_j,*imat_i; 1866 PetscScalar *imat_a; 1867 1868 for (i=0; i<ismax; i++) { 1869 mat = (Mat_SeqAIJ*)submats[i]->data; 1870 imat_j = mat->j; 1871 imat_i = mat->i; 1872 imat_a = mat->a; 1873 imat_ilen = mat->ilen; 1874 1875 if (allcolumns[i]) continue; 1876 jmax = nrow[i]; 1877 for (j=0; j<jmax; j++) { 1878 PetscInt ilen; 1879 1880 mat_i = imat_i[j]; 1881 mat_a = imat_a + mat_i; 1882 mat_j = imat_j + mat_i; 1883 ilen = imat_ilen[j]; 1884 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 1885 } 1886 } 1887 } 1888 1889 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1890 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1891 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1892 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1893 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1894 1895 /* Restore the indices */ 1896 for (i=0; i<ismax; i++) { 1897 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1898 if (!allcolumns[i]) { 1899 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1900 } 1901 } 1902 1903 /* Destroy allocated memory */ 1904 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1905 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1906 ierr = PetscFree(pa);CHKERRQ(ierr); 1907 1908 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1909 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1910 for (i=0; i<nrqr; ++i) { 1911 ierr = 
PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  /* Free the per-request receive buffers: rbuf3 held column indices, rbuf4 held matrix values */
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  /* sbuf_aj/sbuf_aa are freed only through entry [0]: presumably each is one contiguous
     allocation anchored at its first element -- TODO confirm at the allocation site */
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  /* Row maps are hash tables under PETSC_USE_CTABLE (destroyed individually);
     otherwise rmap[0] anchors a single contiguous allocation */
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  /* Column maps exist only for index sets that did not request all columns */
  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Final assembly of every extracted submatrix before returning it to the caller */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    MatCreateMPIAIJFromSeqMatrices_Private - assemble an MPIAIJ matrix C on comm whose
    diagonal block is the SEQAIJ matrix A and whose off-diagonal block is the SEQAIJ
    matrix B; A and B are adopted directly, without copying.

    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB.
  */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* A and B must agree on local row count and block sizes; PetscAbs() is used presumably
     because unset block sizes can be negative -- TODO confirm against PetscLayout docs */
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The global column count of C must exactly equal the diag plus off-diag local column counts */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  /* Adopt A and B directly as the diag/off-diag blocks -- no reference increment (see header comment) */
  aij    = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* C is only as preallocated/assembled as both of its component blocks are */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

/*
   MatMPIAIJExtractSeqMatrices_Private - return borrowed (non-increfed) references to the
   diagonal (A) and off-diagonal (B) SEQAIJ blocks of the MPIAIJ matrix C.
*/
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
   MatGetSubMatricesParallel_MPIXAIJ - extract (possibly parallel) submatrices of C indexed
   by the (isrow[i], iscol[i]) pairs, driven by three format-specific callbacks so the same
   logic can serve any AIJ-like type:
     getsubmats_seq - extracts sequential submatrices (the diag and off-diag pieces),
     makefromseq    - builds a parallel matrix from a diag/off-diag pair of Seq matrices,
     extractseq     - retrieves the diag/off-diag Seq pair from an existing parallel matrix.
   Index sets on a communicator of size 1 yield sequential submatrices; the others yield
   parallel matrices assembled from separately extracted diag and off-diag parts.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel index sets (comm size > 1); each isrow/iscol pair must share a communicator */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat         *A,*B;
    IS          *isrow_c, *iscol_c;
    PetscMPIInt size; /* NOTE(review): shadows the outer 'size' declared above -- harmless here, but worth renaming */
    /*
       Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
       array of sequential matrices underlying the resulting parallel matrices.
       Which arrays to allocate is based on the value of MatReuse scall.
       There are as many diag matrices as there are original index sets.
       There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

       Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
       we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
       getsubmats_seq will deposit the extracted diag and off-diag parts.
       However, if reuse is taking place, we have to allocate the seq matrix arrays here.
       If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i];
      }
    }
    /*
       Construct the complements of the iscol ISs for parallel ISs only.
       These are used to extract the off-diag portion of the resulting parallel matrix.
       The row IS for the off-diag portion is the same as for the diag portion,
       so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i]; /* aliased, not duplicated -- deliberately not destroyed below */

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately.
       NOTE(review): in the MAT_INITIAL_MATRIX path A and B are uninitialized pointers here;
       getsubmats_seq is expected to allocate the arrays itself -- confirm its contract. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    /* The nonlocal column ISs obtained via ISGetNonlocalIS must be released; isrow_c entries are aliases */
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
       If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
       have been extracted directly into the parallel matrices containing them, or
       simply into the sequential matrix identical with the corresponding A (if size == 1).
       Otherwise, make sure that parallel matrices are constructed from A & B, or the
       A is put into the correct submat slot (if size == 1).
    */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else (*submat)[i] = A[i];
      }
    }
    /* Free only the holder arrays; the Seq matrices themselves now live inside (or as) the submats */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
} /* MatGetSubMatricesParallel_MPIXAIJ() */



/*
   MatGetSubMatricesParallel_MPIAIJ - MPIAIJ instantiation of the generic parallel submatrix
   driver above, wired to the AIJ-specific extraction and assembly helpers.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}