/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

/* Classic implementation: hand-rolled MPI_Isend/Irecv request/reply exchange */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

/* Alternative implementation built on PetscSF; selected with -mat_increase_overlap_k */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_K(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat,PetscInt,IS*,PetscInt,PetscInt *);


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the imax index sets
   in is[] by ov levels, one level per pass; each entry of is[] is destroyed
   and re-created in place by the per-pass routines.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscBool      mat_increase_overlap_k,set;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  /* runtime switch between the classic and the PetscSF-based code path;
     default (option absent) is the classic one */
  ierr = PetscOptionsGetBool(NULL,"-mat_increase_overlap_k",&mat_increase_overlap_k,&set);CHKERRQ(ierr);
  if(!set) mat_increase_overlap_k = PETSC_FALSE;
  for (i=0; i<ov; ++i) {
    if(mat_increase_overlap_k){
      ierr = MatIncreaseOverlap_MPIAIJ_Once_K(C,imax,is);CHKERRQ(ierr);
    }else{
      ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_K"
/*
   MatIncreaseOverlap_MPIAIJ_Once_K - One level of overlap growth using a
   PetscSF star forest for the inter-processor exchange: remotely owned rows
   are shipped to their owners as <is id, row> pairs, expanded there
   (.._Send_K) and the expansions sent back and merged (.._Receive_K), while
   the locally owned rows are expanded by .._Local_K, overlapped with the
   return communication.
*/
static PetscErrorCode
MatIncreaseOverlap_MPIAIJ_Once_K(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes,*sbdata,nto,nfrom;
  const PetscInt **indices,*indices_i;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  /* communicator */
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* row map */
  ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr);
  /* retrieve IS data */
  ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr);
  /* get length and indices of each incoming index set */
  for(i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find which of these rows live on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for(i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for(j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: remember the row, its owner and which IS it came from */
      if(owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* no rank has any remote rows, hence no messages at all
   * It usually corresponds to overlap 1
   * */
  if(!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* compact per-destination sizes and the list of destination ranks */
  for(i=0; i<size; i++){
    if(tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size (two ints per requested row) */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for(i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  /* build a star forest to deliver the row requests; leaves are the slots of
     the receive buffer, roots live in the senders' packed buffers */
  nrecvrows = 0;
  for(i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nfrom; i++){
    for(j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); */
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for(i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* deal with remote data: expand the received rows on their owners */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_K(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second two-sided exchange, now in the reverse direction, for the replies */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for(i=0; i<nto; i++){
    for(j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation: expand the locally owned rows
     while the expanded remote data is in flight */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  /* ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr); */
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_K(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_K"
/*
   MatIncreaseOverlap_MPIAIJ_Receive_K - Merges the expanded row lists
   received from other processors (triples <is id, count, rows...> packed in
   recvdata) into the local index sets and rebuilds each IS without
   duplicates.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ?
lsize:max_lsize; /* longest incoming IS */
    isz[i] = lsize;
  }
  /* each IS gets max_lsize+nrecvs slots: enough for its current entries plus
     everything that could possibly arrive for it */
  ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information: unpack the <is id, count, rows...> triples */
  for(i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for(j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities and recreate the index sets */
  for(i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_K"
/*
   MatIncreaseOverlap_MPIAIJ_Send_K - For every row requested by another
   processor (pairs <is id, row> in fromrows), collects the global column
   indices of that row from the diagonal (A) and off-diagonal (B) blocks and
   packs one reply buffer per requesting rank as triples
   <is id, count, columns...>.

   Output: (*sbrowsizes)[2*i] is the reply length for fromranks[i] and
   (*sbrowsizes)[2*i+1] its offset into *sbrows; both arrays are allocated
   here and ownership passes to the caller.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* longest message */
  max_fszs = 0;
  for(i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for(i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first sweep over the requests: only count the size of each reply */
  for(i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group the requested rows by index set */
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++]; /* no of is */
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for(j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i])*/
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr);
          if(owner != fromranks[i]) */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second sweep: use this code again, now actually filling the buffers */
  for(i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for(j=0; j<fromsizes[2*i]; j+=2){
      is_id                       = fromrows[rows_pos++];
      rows_i                      = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for(j=0; j<nidx; j++){
      if(!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for(l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for(k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for(k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* offsets of the per-rank replies within sbdata */
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for(i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if(sbrowsizes) *sbrowsizes = sbsizes;
  if(sbrows)     *sbrows     = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_K"
/*
   MatIncreaseOverlap_MPIAIJ_Local_K - Expands each IS with the global column
   indices of its locally owned rows; rows owned elsewhere are skipped here
   (they are handled through the message exchange in .._Once_K).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr);
  for(i=0;
i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for(j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if(owner != rank) continue; /* remotely owned rows are expanded by their owners */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for(k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for(k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all proc */
  w1[rank] = 0; /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one outgoing slice per destination */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the received requests and build the reply messages */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a realloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial size heuristic: average nonzeros per local row times requested rows */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) { /* out of space: grow xdata[0] by 1.5x and re-point the per-message slices */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 951 952 if (scall == MAT_INITIAL_MATRIX) { 953 /* ---------------------------------------------------------------- 954 Tell every processor the number of nonzeros per row 955 */ 956 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 957 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 958 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 959 } 960 sendcount = A->rmap->rend - A->rmap->rstart; 961 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 962 for (i=0; i<size; i++) { 963 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 964 displs[i] = A->rmap->range[i]; 965 } 966 #if defined(PETSC_HAVE_MPI_IN_PLACE) 967 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 968 #else 969 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 970 #endif 971 /* --------------------------------------------------------------- 972 Create the sequential matrix of the same type as the local block diagonal 973 */ 974 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 975 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 976 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 977 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 978 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 979 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 980 **Bin = B; 981 b = (Mat_SeqAIJ*)B->data; 982 983 /*-------------------------------------------------------------------- 984 Copy my part of matrix column indices over 985 */ 986 sendcount = ad->nz + bd->nz; 987 jsendbuf = b->j + b->i[rstarts[rank]]; 988 a_jsendbuf = ad->j; 989 b_jsendbuf = bd->j; 990 n = A->rmap->rend - A->rmap->rstart; 991 cnt = 0; 992 for (i=0; 
i<n; i++) { 993 994 /* put in lower diagonal portion */ 995 m = bd->i[i+1] - bd->i[i]; 996 while (m > 0) { 997 /* is it above diagonal (in bd (compressed) numbering) */ 998 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 999 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1000 m--; 1001 } 1002 1003 /* put in diagonal portion */ 1004 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1005 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1006 } 1007 1008 /* put in upper diagonal portion */ 1009 while (m-- > 0) { 1010 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1011 } 1012 } 1013 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1014 1015 /*-------------------------------------------------------------------- 1016 Gather all column indices to all processors 1017 */ 1018 for (i=0; i<size; i++) { 1019 recvcounts[i] = 0; 1020 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1021 recvcounts[i] += lens[j]; 1022 } 1023 } 1024 displs[0] = 0; 1025 for (i=1; i<size; i++) { 1026 displs[i] = displs[i-1] + recvcounts[i-1]; 1027 } 1028 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1029 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1030 #else 1031 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1032 #endif 1033 /*-------------------------------------------------------------------- 1034 Assemble the matrix into useable form (note numerical values not yet set) 1035 */ 1036 /* set the b->ilen (length of each row) values */ 1037 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1038 /* set the b->i indices */ 1039 b->i[0] = 0; 1040 for (i=1; i<=A->rmap->N; i++) { 1041 b->i[i] = b->i[i-1] + lens[i-1]; 1042 } 1043 ierr = PetscFree(lens);CHKERRQ(ierr); 1044 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1045 ierr = 
MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1046 1047 } else { 1048 B = **Bin; 1049 b = (Mat_SeqAIJ*)B->data; 1050 } 1051 1052 /*-------------------------------------------------------------------- 1053 Copy my part of matrix numerical values into the values location 1054 */ 1055 if (flag == MAT_GET_VALUES) { 1056 sendcount = ad->nz + bd->nz; 1057 sendbuf = b->a + b->i[rstarts[rank]]; 1058 a_sendbuf = ad->a; 1059 b_sendbuf = bd->a; 1060 b_sendj = bd->j; 1061 n = A->rmap->rend - A->rmap->rstart; 1062 cnt = 0; 1063 for (i=0; i<n; i++) { 1064 1065 /* put in lower diagonal portion */ 1066 m = bd->i[i+1] - bd->i[i]; 1067 while (m > 0) { 1068 /* is it above diagonal (in bd (compressed) numbering) */ 1069 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1070 sendbuf[cnt++] = *b_sendbuf++; 1071 m--; 1072 b_sendj++; 1073 } 1074 1075 /* put in diagonal portion */ 1076 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1077 sendbuf[cnt++] = *a_sendbuf++; 1078 } 1079 1080 /* put in upper diagonal portion */ 1081 while (m-- > 0) { 1082 sendbuf[cnt++] = *b_sendbuf++; 1083 b_sendj++; 1084 } 1085 } 1086 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1087 1088 /* ----------------------------------------------------------------- 1089 Gather all numerical values to all processors 1090 */ 1091 if (!recvcounts) { 1092 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1093 } 1094 for (i=0; i<size; i++) { 1095 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1096 } 1097 displs[0] = 0; 1098 for (i=1; i<size; i++) { 1099 displs[i] = displs[i-1] + recvcounts[i-1]; 1100 } 1101 recvbuf = b->a; 1102 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1103 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1104 #else 1105 ierr = 
MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1106 #endif 1107 } /* endof (flag == MAT_GET_VALUES) */ 1108 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1109 1110 if (A->symmetric) { 1111 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1112 } else if (A->hermitian) { 1113 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1114 } else if (A->structurally_symmetric) { 1115 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1116 } 1117 PetscFunctionReturn(0); 1118 } 1119 1120 1121 1122 #undef __FUNCT__ 1123 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 1124 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1125 { 1126 PetscErrorCode ierr; 1127 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 1128 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 1129 1130 PetscFunctionBegin; 1131 1132 /* 1133 Check for special case: each processor gets entire matrix 1134 */ 1135 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1136 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1137 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1138 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1139 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1140 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1141 wantallmatrix = PETSC_TRUE; 1142 1143 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1144 } 1145 } 1146 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 1147 if (twantallmatrix) { 1148 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 1149 PetscFunctionReturn(0); 1150 } 1151 1152 /* Allocate memory to hold all the submatrices */ 1153 if (scall != 
MAT_REUSE_MATRIX) { 1154 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 1155 } 1156 1157 /* Check for special case: each processor gets entire matrix columns */ 1158 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 1159 for (i=0; i<ismax; i++) { 1160 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 1161 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 1162 if (colflag && ncol == C->cmap->N) { 1163 allcolumns[i] = PETSC_TRUE; 1164 } else { 1165 allcolumns[i] = PETSC_FALSE; 1166 } 1167 } 1168 1169 /* Determine the number of stages through which submatrices are done */ 1170 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1171 1172 /* 1173 Each stage will extract nmax submatrices. 1174 nmax is determined by the matrix column dimension. 1175 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1176 */ 1177 if (!nmax) nmax = 1; 1178 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 1179 1180 /* Make sure every processor loops through the nstages */ 1181 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 1182 1183 for (i=0,pos=0; i<nstages; i++) { 1184 if (pos+nmax <= ismax) max_no = nmax; 1185 else if (pos == ismax) max_no = 0; 1186 else max_no = ismax-pos; 1187 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 1188 pos += max_no; 1189 } 1190 1191 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 1192 PetscFunctionReturn(0); 1193 } 1194 1195 /* -------------------------------------------------------------------------*/ 1196 #undef __FUNCT__ 1197 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 1198 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 1199 { 1200 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1201 Mat A = c->A; 1202 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 
1203 const PetscInt **icol,**irow; 1204 PetscInt *nrow,*ncol,start; 1205 PetscErrorCode ierr; 1206 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 1207 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 1208 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 1209 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 1210 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 1211 #if defined(PETSC_USE_CTABLE) 1212 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 1213 #else 1214 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 1215 #endif 1216 const PetscInt *irow_i; 1217 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 1218 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1219 MPI_Request *r_waits4,*s_waits3,*s_waits4; 1220 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 1221 MPI_Status *r_status3,*r_status4,*s_status4; 1222 MPI_Comm comm; 1223 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 1224 PetscMPIInt *onodes1,*olengths1; 1225 PetscMPIInt idex,idex2,end; 1226 1227 PetscFunctionBegin; 1228 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1229 tag0 = ((PetscObject)C)->tag; 1230 size = c->size; 1231 rank = c->rank; 1232 1233 /* Get some new tags to keep the communication clean */ 1234 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 1235 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1236 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1237 1238 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 1239 1240 for (i=0; i<ismax; i++) { 1241 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 1242 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 1243 if (allcolumns[i]) { 1244 icol[i] = NULL; 1245 ncol[i] = C->cmap->N; 1246 } else { 1247 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 1248 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 1249 } 1250 } 1251 1252 /* evaluate 
communication - mesg to who, length of mesg, and buffer space 1253 required. Based on this, buffers are allocated, and data copied into them*/ 1254 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 1255 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1256 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1257 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1258 for (i=0; i<ismax; i++) { 1259 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1260 jmax = nrow[i]; 1261 irow_i = irow[i]; 1262 for (j=0; j<jmax; j++) { 1263 l = 0; 1264 row = irow_i[j]; 1265 while (row >= C->rmap->range[l+1]) l++; 1266 proc = l; 1267 w4[proc]++; 1268 } 1269 for (j=0; j<size; j++) { 1270 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 1271 } 1272 } 1273 1274 nrqs = 0; /* no of outgoing messages */ 1275 msz = 0; /* total mesg length (for all procs) */ 1276 w1[rank] = 0; /* no mesg sent to self */ 1277 w3[rank] = 0; 1278 for (i=0; i<size; i++) { 1279 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 1280 } 1281 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1282 for (i=0,j=0; i<size; i++) { 1283 if (w1[i]) { pa[j] = i; j++; } 1284 } 1285 1286 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1287 for (i=0; i<nrqs; i++) { 1288 j = pa[i]; 1289 w1[j] += w2[j] + 2* w3[j]; 1290 msz += w1[j]; 1291 } 1292 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1293 1294 /* Determine the number of messages to expect, their lengths, from from-ids */ 1295 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1296 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1297 1298 /* Now post the Irecvs corresponding to these messages */ 1299 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1300 1301 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1302 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1303 1304 /* Allocate Memory for outgoing messages */ 1305 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1306 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1307 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1308 1309 { 1310 PetscInt *iptr = tmp,ict = 0; 1311 for (i=0; i<nrqs; i++) { 1312 j = pa[i]; 1313 iptr += ict; 1314 sbuf1[j] = iptr; 1315 ict = w1[j]; 1316 } 1317 } 1318 1319 /* Form the outgoing messages */ 1320 /* Initialize the header space */ 1321 for (i=0; i<nrqs; i++) { 1322 j = pa[i]; 1323 sbuf1[j][0] = 0; 1324 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 1325 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 1326 } 1327 1328 /* Parse the isrow and copy data into outbuf */ 1329 for (i=0; i<ismax; i++) { 1330 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1331 irow_i = irow[i]; 1332 jmax = nrow[i]; 1333 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 1334 l = 0; 1335 row = irow_i[j]; 1336 while (row >= C->rmap->range[l+1]) l++; 1337 proc = l; 1338 if (proc != rank) { /* copy to the outgoing buf*/ 1339 ctr[proc]++; 1340 *ptr[proc] = row; 1341 ptr[proc]++; 1342 } 1343 } 1344 /* Update the headers for the current IS */ 1345 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1346 if ((ctr_j = ctr[j])) { 1347 sbuf1_j = sbuf1[j]; 1348 k = ++sbuf1_j[0]; 1349 sbuf1_j[2*k] = ctr_j; 1350 sbuf1_j[2*k-1] = i; 1351 } 1352 } 1353 } 1354 1355 /* Now post the sends */ 1356 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1357 for (i=0; i<nrqs; ++i) { 1358 j = pa[i]; 1359 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 1360 } 1361 1362 /* Post Receives to capture the buffer size */ 1363 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 1364 
ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 1365 rbuf2[0] = tmp + msz; 1366 for (i=1; i<nrqs; ++i) { 1367 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 1368 } 1369 for (i=0; i<nrqs; ++i) { 1370 j = pa[i]; 1371 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 1372 } 1373 1374 /* Send to other procs the buf size they should allocate */ 1375 1376 1377 /* Receive messages*/ 1378 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 1379 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1380 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 1381 { 1382 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 1383 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 1384 PetscInt *sbuf2_i; 1385 1386 for (i=0; i<nrqr; ++i) { 1387 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1388 1389 req_size[idex] = 0; 1390 rbuf1_i = rbuf1[idex]; 1391 start = 2*rbuf1_i[0] + 1; 1392 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1393 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 1394 sbuf2_i = sbuf2[idex]; 1395 for (j=start; j<end; j++) { 1396 id = rbuf1_i[j] - rstart; 1397 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1398 sbuf2_i[j] = ncols; 1399 req_size[idex] += ncols; 1400 } 1401 req_source[idex] = r_status1[i].MPI_SOURCE; 1402 /* form the header */ 1403 sbuf2_i[0] = req_size[idex]; 1404 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1405 1406 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1407 } 1408 } 1409 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1410 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1411 1412 /* recv buffer sizes */ 1413 /* Receive messages*/ 1414 1415 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 1416 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1417 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 1418 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1419 ierr = 
PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 1420 1421 for (i=0; i<nrqs; ++i) { 1422 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1423 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 1424 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 1425 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1426 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1427 } 1428 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1429 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1430 1431 /* Wait on sends1 and sends2 */ 1432 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1433 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 1434 1435 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1436 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1437 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1438 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1439 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1440 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1441 1442 /* Now allocate buffers for a->j, and send them off */ 1443 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1444 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1445 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1446 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1447 1448 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 1449 { 1450 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1451 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1452 PetscInt cend = C->cmap->rend; 1453 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1454 1455 for (i=0; i<nrqr; i++) { 1456 rbuf1_i = rbuf1[i]; 1457 sbuf_aj_i = sbuf_aj[i]; 1458 ct1 = 2*rbuf1_i[0] + 1; 1459 ct2 = 0; 1460 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1461 kmax = rbuf1[i][2*j]; 1462 for (k=0; k<kmax; k++,ct1++) { 
1463 row = rbuf1_i[ct1] - rstart; 1464 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1465 ncols = nzA + nzB; 1466 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1467 1468 /* load the column indices for this row into cols*/ 1469 cols = sbuf_aj_i + ct2; 1470 1471 lwrite = 0; 1472 for (l=0; l<nzB; l++) { 1473 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1474 } 1475 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1476 for (l=0; l<nzB; l++) { 1477 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1478 } 1479 1480 ct2 += ncols; 1481 } 1482 } 1483 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1484 } 1485 } 1486 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 1487 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 1488 1489 /* Allocate buffers for a->a, and send them off */ 1490 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1491 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1492 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1493 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1494 1495 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1496 { 1497 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1498 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1499 PetscInt cend = C->cmap->rend; 1500 PetscInt *b_j = b->j; 1501 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1502 1503 for (i=0; i<nrqr; i++) { 1504 rbuf1_i = rbuf1[i]; 1505 sbuf_aa_i = sbuf_aa[i]; 1506 ct1 = 2*rbuf1_i[0]+1; 1507 ct2 = 0; 1508 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1509 kmax = rbuf1_i[2*j]; 1510 for (k=0; k<kmax; k++,ct1++) { 1511 row = rbuf1_i[ct1] - rstart; 1512 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1513 ncols = nzA + nzB; 1514 cworkB = b_j + b_i[row]; 1515 vworkA = a_a + a_i[row]; 1516 vworkB = b_a + b_i[row]; 1517 1518 /* load the column values for this row into vals*/ 1519 vals = 
sbuf_aa_i+ct2; 1520 1521 lwrite = 0; 1522 for (l=0; l<nzB; l++) { 1523 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1524 } 1525 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1526 for (l=0; l<nzB; l++) { 1527 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1528 } 1529 1530 ct2 += ncols; 1531 } 1532 } 1533 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1534 } 1535 } 1536 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1537 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1538 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1539 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1540 1541 /* Form the matrix */ 1542 /* create col map: global col of C -> local col of submatrices */ 1543 { 1544 const PetscInt *icol_i; 1545 #if defined(PETSC_USE_CTABLE) 1546 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1547 for (i=0; i<ismax; i++) { 1548 if (!allcolumns[i]) { 1549 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1550 1551 jmax = ncol[i]; 1552 icol_i = icol[i]; 1553 cmap_i = cmap[i]; 1554 for (j=0; j<jmax; j++) { 1555 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1556 } 1557 } else { 1558 cmap[i] = NULL; 1559 } 1560 } 1561 #else 1562 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1563 for (i=0; i<ismax; i++) { 1564 if (!allcolumns[i]) { 1565 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1566 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1567 jmax = ncol[i]; 1568 icol_i = icol[i]; 1569 cmap_i = cmap[i]; 1570 for (j=0; j<jmax; j++) { 1571 cmap_i[icol_i[j]] = j+1; 1572 } 1573 } else { 1574 cmap[i] = NULL; 1575 } 1576 } 1577 #endif 1578 } 1579 1580 /* Create lens which is required for MatCreate... 
*/ 1581 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1582 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1583 if (ismax) { 1584 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1585 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1586 } 1587 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1588 1589 /* Update lens from local data */ 1590 for (i=0; i<ismax; i++) { 1591 jmax = nrow[i]; 1592 if (!allcolumns[i]) cmap_i = cmap[i]; 1593 irow_i = irow[i]; 1594 lens_i = lens[i]; 1595 for (j=0; j<jmax; j++) { 1596 l = 0; 1597 row = irow_i[j]; 1598 while (row >= C->rmap->range[l+1]) l++; 1599 proc = l; 1600 if (proc == rank) { 1601 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1602 if (!allcolumns[i]) { 1603 for (k=0; k<ncols; k++) { 1604 #if defined(PETSC_USE_CTABLE) 1605 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1606 #else 1607 tcol = cmap_i[cols[k]]; 1608 #endif 1609 if (tcol) lens_i[j]++; 1610 } 1611 } else { /* allcolumns */ 1612 lens_i[j] = ncols; 1613 } 1614 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1615 } 1616 } 1617 } 1618 1619 /* Create row map: global row of C -> local row of submatrices */ 1620 #if defined(PETSC_USE_CTABLE) 1621 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1622 for (i=0; i<ismax; i++) { 1623 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1624 rmap_i = rmap[i]; 1625 irow_i = irow[i]; 1626 jmax = nrow[i]; 1627 for (j=0; j<jmax; j++) { 1628 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1629 } 1630 } 1631 #else 1632 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1633 if (ismax) { 1634 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1635 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1636 } 1637 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1638 for (i=0; i<ismax; i++) { 1639 rmap_i = rmap[i]; 1640 irow_i = irow[i]; 1641 jmax = nrow[i]; 1642 for (j=0; j<jmax; 
j++) { 1643 rmap_i[irow_i[j]] = j; 1644 } 1645 } 1646 #endif 1647 1648 /* Update lens from offproc data */ 1649 { 1650 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1651 1652 for (tmp2=0; tmp2<nrqs; tmp2++) { 1653 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1654 idex = pa[idex2]; 1655 sbuf1_i = sbuf1[idex]; 1656 jmax = sbuf1_i[0]; 1657 ct1 = 2*jmax+1; 1658 ct2 = 0; 1659 rbuf2_i = rbuf2[idex2]; 1660 rbuf3_i = rbuf3[idex2]; 1661 for (j=1; j<=jmax; j++) { 1662 is_no = sbuf1_i[2*j-1]; 1663 max1 = sbuf1_i[2*j]; 1664 lens_i = lens[is_no]; 1665 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1666 rmap_i = rmap[is_no]; 1667 for (k=0; k<max1; k++,ct1++) { 1668 #if defined(PETSC_USE_CTABLE) 1669 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1670 row--; 1671 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1672 #else 1673 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1674 #endif 1675 max2 = rbuf2_i[ct1]; 1676 for (l=0; l<max2; l++,ct2++) { 1677 if (!allcolumns[is_no]) { 1678 #if defined(PETSC_USE_CTABLE) 1679 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1680 #else 1681 tcol = cmap_i[rbuf3_i[ct2]]; 1682 #endif 1683 if (tcol) lens_i[row]++; 1684 } else { /* allcolumns */ 1685 lens_i[row]++; /* lens_i[row] += max2 ? */ 1686 } 1687 } 1688 } 1689 } 1690 } 1691 } 1692 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1693 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1694 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1695 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1696 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1697 1698 /* Create the submatrices */ 1699 if (scall == MAT_REUSE_MATRIX) { 1700 PetscBool flag; 1701 1702 /* 1703 Assumes new rows are same length as the old rows,hence bug! 
1704 */ 1705 for (i=0; i<ismax; i++) { 1706 mat = (Mat_SeqAIJ*)(submats[i]->data); 1707 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1708 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1709 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1710 /* Initial matrix as if empty */ 1711 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1712 1713 submats[i]->factortype = C->factortype; 1714 } 1715 } else { 1716 for (i=0; i<ismax; i++) { 1717 PetscInt rbs,cbs; 1718 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1719 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1720 1721 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1722 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1723 1724 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1725 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1726 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1727 } 1728 } 1729 1730 /* Assemble the matrices */ 1731 /* First assemble the local rows */ 1732 { 1733 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1734 PetscScalar *imat_a; 1735 1736 for (i=0; i<ismax; i++) { 1737 mat = (Mat_SeqAIJ*)submats[i]->data; 1738 imat_ilen = mat->ilen; 1739 imat_j = mat->j; 1740 imat_i = mat->i; 1741 imat_a = mat->a; 1742 1743 if (!allcolumns[i]) cmap_i = cmap[i]; 1744 rmap_i = rmap[i]; 1745 irow_i = irow[i]; 1746 jmax = nrow[i]; 1747 for (j=0; j<jmax; j++) { 1748 l = 0; 1749 row = irow_i[j]; 1750 while (row >= C->rmap->range[l+1]) l++; 1751 proc = l; 1752 if (proc == rank) { 1753 old_row = row; 1754 #if defined(PETSC_USE_CTABLE) 1755 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1756 row--; 1757 #else 1758 row = rmap_i[row]; 1759 #endif 
/* --- Assemble the locally owned rows of each submatrix -------------------- */
/* NOTE(review): this chunk begins mid-function; the enclosing loops over the
   requested index sets and their rows open before this point and are not
   visible here. */
ilen_row = imat_ilen[row];                /* entries already stored in submatrix row 'row' */
ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
mat_i = imat_i[row];
mat_a = imat_a + mat_i;                   /* insertion point for values of this row */
mat_j = imat_j + mat_i;                   /* insertion point for column indices of this row */
if (!allcolumns[i]) {
  /* Keep only the columns selected by iscol[i]; cmap_i maps a global column of C
     to (submatrix column + 1), so 0 means "column not selected". */
  for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
    tcol = cmap_i[cols[k]];
#endif
    if (tcol) {
      *mat_j++ = tcol - 1;                /* undo the +1 offset used so 0 can mean "absent" */
      *mat_a++ = vals[k];
      ilen_row++;
    }
  }
} else { /* allcolumns: keep every entry, storing C's global column index */
  for (k=0; k<ncols; k++) {
    *mat_j++ = cols[k]; /* global col index! */
    *mat_a++ = vals[k];
    ilen_row++;
  }
}
ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

imat_ilen[row] = ilen_row;               /* record final length of this submatrix row */
}
}
}
}

/* --- Assemble the off-process rows ---------------------------------------- */
/* rbuf3/rbuf4 hold the column indices / values of rows that this process
   requested from other ranks; insert them into the proper submatrices. */
{
  PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
  PetscInt    *imat_j,*imat_i;
  PetscScalar *imat_a,*rbuf4_i;

  for (tmp2=0; tmp2<nrqs; tmp2++) {
    /* Process replies in whatever order they arrive. */
    ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
    idex = pa[idex2];
    sbuf1_i = sbuf1[idex];
    jmax = sbuf1_i[0];                   /* number of index sets covered by this request */
    ct1 = 2*jmax + 1;                    /* offset of the first requested row number */
    ct2 = 0;
    rbuf2_i = rbuf2[idex2];
    rbuf3_i = rbuf3[idex2];
    rbuf4_i = rbuf4[idex2];
    for (j=1; j<=jmax; j++) {
      is_no = sbuf1_i[2*j-1];            /* which submatrix these rows belong to */
      rmap_i = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      mat = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = mat->ilen;
      imat_j = mat->j;
      imat_i = mat->i;
      imat_a = mat->a;
      max1 = sbuf1_i[2*j];               /* number of rows requested for this submatrix */
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;                           /* rmap stores local row + 1 */
#else
        row = rmap_i[row];
#endif
        ilen = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2 = rbuf2_i[ct1];             /* number of entries sent for this row */
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }
}

/* --- Sort the column indices (with matching values) within each row ------- */
{
  PetscInt    *imat_ilen,*imat_j,*imat_i;
  PetscScalar *imat_a;

  for (i=0; i<ismax; i++) {
    mat = (Mat_SeqAIJ*)submats[i]->data;
    imat_j = mat->j;
    imat_i = mat->i;
    imat_a = mat->a;
    imat_ilen = mat->ilen;

    /* allcolumns submatrices retain C's column ordering; no sort needed. */
    if (allcolumns[i]) continue;
    jmax = nrow[i];
    for (j=0; j<jmax; j++) {
      PetscInt ilen;

      mat_i = imat_i[j];
      mat_a = imat_a + mat_i;
      mat_j = imat_j + mat_i;
      ilen = imat_ilen[j];
      ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
    }
  }
}

/* Release the request/status bookkeeping of the final (value) exchange. */
ierr = PetscFree(r_status4);CHKERRQ(ierr);
ierr = PetscFree(r_waits4);CHKERRQ(ierr);
if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
ierr = PetscFree(s_waits4);CHKERRQ(ierr);
ierr = PetscFree(s_status4);CHKERRQ(ierr);

/* Restore the index-set arrays obtained earlier with ISGetIndices. */
for (i=0; i<ismax; i++) {
  ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
  if (!allcolumns[i]) {
    ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
  }
}

/* Destroy allocated memory */
ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
ierr = PetscFree(pa);CHKERRQ(ierr);

ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
ierr = PetscFree(rbuf2);CHKERRQ(ierr);
for (i=0; i<nrqr; ++i) {
  ierr =
PetscFree(sbuf2[i]);CHKERRQ(ierr);
}
for (i=0; i<nrqs; ++i) {
  ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
}

ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
ierr = PetscFree(rbuf3);CHKERRQ(ierr);
ierr = PetscFree(rbuf4);CHKERRQ(ierr);
/* sbuf_aj/sbuf_aa point into one contiguous slab anchored at element [0]. */
ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
ierr = PetscFree(rmap);CHKERRQ(ierr);

for (i=0; i<ismax; i++) {
  if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
    ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
  }
}
ierr = PetscFree(cmap);CHKERRQ(ierr);
if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
ierr = PetscFree(lens);CHKERRQ(ierr);

/* Finalize every extracted submatrix. */
for (i=0; i<ismax; i++) {
  ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
}
PetscFunctionReturn(0);
}

/*
   Build an MPIAIJ matrix *C on communicator comm whose local diagonal block is
   the SEQAIJ matrix A and whose local off-diagonal block is the SEQAIJ matrix B.

   Observe that the Seq matrices used to construct this MPI matrix are not increfed.
   Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB.
  */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* A and B must agree on local row count and block sizes; PetscAbs() is used
     presumably because block sizes may still be negative (unset) here --
     NOTE(review): confirm against PetscLayout conventions. */
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* Global column count of C must equal diag columns + off-diag columns. */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  /* Install A and B directly as the diag/off-diag blocks; no reference is taken. */
  aij = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* C is ready exactly when both component matrices are. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

/*
   Return (without taking references) the sequential diagonal block *A and
   off-diagonal block *B of the MPIAIJ matrix C.
*/
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
   Extract submatrices (isrow[i], iscol[i]) of C, supporting index sets that
   live on parallel subcommunicators.  The type-specific operations are passed
   in as callbacks:
     getsubmats_seq - extracts sequential submatrices (diag or off-diag parts)
     makefromseq    - assembles a parallel matrix from its seq diag/off-diag parts
     extractseq     - recovers the seq diag/off-diag parts of a parallel matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;   /* number of index sets with comm size > 1 */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Each (isrow[i], iscol[i]) pair must share a communicator; count the parallel pairs. */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat *A,*B;
    IS  *isrow_c, *iscol_c;
    PetscMPIInt size;   /* NOTE(review): shadows the outer 'size' declared above */
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

     Sequential matrix arrays are needed in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into a single array to serve as placeholders, into which
     getsubmats_seq will deposit the extracted diag and off-diag parts.
     Thus, if reuse is taking place, we have to allocate the seq matrix arrays here.
     If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
     */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i];
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
     */
    ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i];

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately.
*/ 2076 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr); 2077 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr); 2078 for (ii = 0; ii < ismax_c; ++ii) { 2079 ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr); 2080 } 2081 ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr); 2082 /* 2083 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 2084 have been extracted directly into the parallel matrices containing them, or 2085 simply into the sequential matrix identical with the corresponding A (if size == 1). 2086 Otherwise, make sure that parallel matrices are constructed from A & B, or the 2087 A is put into the correct submat slot (if size == 1). 2088 */ 2089 if (scall != MAT_REUSE_MATRIX) { 2090 for (i = 0, ii = 0; i < ismax; ++i) { 2091 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 2092 if (size > 1) { 2093 /* 2094 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 2095 */ 2096 /* Construct submat[i] from the Seq pieces A and B. */ 2097 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr); 2098 2099 ++ii; 2100 } else (*submat)[i] = A[i]; 2101 } 2102 } 2103 ierr = PetscFree(A);CHKERRQ(ierr); 2104 ierr = PetscFree(B);CHKERRQ(ierr); 2105 } 2106 PetscFunctionReturn(0); 2107 } /* MatGetSubMatricesParallel_MPIXAIJ() */ 2108 2109 2110 2111 #undef __FUNCT__ 2112 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 2113 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2114 { 2115 PetscErrorCode ierr; 2116 2117 PetscFunctionBegin; 2118 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr); 2119 PetscFunctionReturn(0); 2120 } 2121