1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_K(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 #undef __FUNCT__ 23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 25 { 26 PetscErrorCode ierr; 27 PetscBool mat_overlap_k,set; 28 PetscInt i; 29 30 PetscFunctionBegin; 31 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 32 ierr = PetscOptionsGetBool(NULL,"-mat_overlap_k",&mat_overlap_k,&set);CHKERRQ(ierr); 33 if(!set) mat_overlap_k = PETSC_FALSE; 34 for (i=0; i<ov; ++i) { 35 if(mat_overlap_k){ 36 ierr = MatIncreaseOverlap_MPIAIJ_Once_K(C,imax,is);CHKERRQ(ierr); 37 }else{ 38 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 39 } 40 } 41 PetscFunctionReturn(0); 42 } 43 44 45 #undef __FUNCT__ 46 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_K" 47 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_K(Mat 
mat,PetscInt nidx,IS is[]) 48 { 49 PetscErrorCode ierr; 50 MPI_Comm comm; 51 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 52 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 53 PetscInt nrecvrows,*sbsizes,*sbdata,nto,nfrom; 54 const PetscInt **indices,*indices_i; 55 PetscLayout rmap; 56 PetscMPIInt rank,size,*toranks,*fromranks; 57 PetscSF sf; 58 PetscSFNode *remote; 59 60 PetscFunctionBegin; 61 /*communicator */ 62 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 63 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 64 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 65 /*row map*/ 66 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 67 /*retrieve IS data*/ 68 ierr = PetscCalloc2(nidx,&indices,nidx,&length);CHKERRQ(ierr); 69 /*get length and indices*/ 70 for(i=0,tlength=0; i<nidx; i++){ 71 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 72 tlength += length[i]; 73 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 74 } 75 /*find these rows on remote processors */ 76 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 77 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 78 nrrows = 0; 79 for(i=0; i<nidx; i++){ 80 length_i = length[i]; 81 indices_i = indices[i]; 82 for(j=0; j<length_i; j++){ 83 owner = -1; 84 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 85 /*remote processors*/ 86 if(owner != rank){ 87 tosizes_temp[owner]++; /*number of rows to owner*/ 88 rrow_ranks[nrrows] = owner; /*processor */ 89 rrow_isids[nrrows] = i; /*is id*/ 90 remoterows[nrrows++] = indices_i[j]; /* row */ 91 } 92 } 93 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 94 } 95 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 96 /*test if we need to exchange messages 97 * generally speaking, we do not need to exchange 98 * data when overlap is 1 99 * */ 100 ierr = 
MPI_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 101 /*we do not have any messages 102 * It usually corresponds to overlap 1 103 * */ 104 if(!reducednrrows){ 105 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 106 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 107 ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr); 108 PetscFunctionReturn(0); 109 } 110 nto = 0; 111 /*send sizes and ranks*/ 112 for(i=0; i<size; i++){ 113 if(tosizes_temp[i]){ 114 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 115 tosizes_temp[i] = nto; /* a map from processor to index */ 116 toranks[nto++] = i; /* processor */ 117 } 118 } 119 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 120 for(i=0; i<nto; i++){ 121 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /*offsets*/ 122 tosizes[2*i+1] = toffsets[i]; /*offsets to send*/ 123 } 124 /*send information to other processors*/ 125 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 126 /*build a star forest */ 127 nrecvrows = 0; 128 for(i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 129 ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr); 130 nrecvrows = 0; 131 for(i=0; i<nfrom; i++){ 132 for(j=0; j<fromsizes[2*i]; j++){ 133 remote[nrecvrows].rank = fromranks[i]; 134 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 135 } 136 } 137 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 138 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 139 /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/ 140 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 141 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 142 /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/ 143 /*message pair <no of is, row> */ 144 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 145 for(i=0; 
i<nrrows; i++){ 146 owner = rrow_ranks[i]; /* processor */ 147 j = tosizes_temp[owner]; /* index */ 148 todata[toffsets[j]++] = rrow_isids[i]; 149 todata[toffsets[j]++] = remoterows[i]; 150 } 151 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 152 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 153 ierr = PetscFree(toffsets);CHKERRQ(ierr); 154 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 155 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 157 /*deal with remote data */ 158 ierr = MatIncreaseOverlap_MPIAIJ_Send_K(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 159 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 160 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 161 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 162 ierr = PetscFree(fromranks);CHKERRQ(ierr); 163 nrecvrows = 0; 164 for(i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 165 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 166 ierr = PetscMalloc(nrecvrows*sizeof(PetscSFNode),&remote);CHKERRQ(ierr); 167 nrecvrows = 0; 168 for(i=0; i<nto; i++){ 169 for(j=0; j<tosizes[2*i]; j++){ 170 remote[nrecvrows].rank = toranks[i]; 171 remote[nrecvrows++].index = tosizes[2*i+1]+j; 172 } 173 } 174 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 175 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 176 /*use two-sided communication by default since OPENMPI has some bugs for one-sided one*/ 177 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 178 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 179 /*overlap communication and computation*/ 180 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 181 ierr = MatIncreaseOverlap_MPIAIJ_Local_K(mat,nidx,is);CHKERRQ(ierr); 182 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 
183 /*ierr = PetscSFView(sf,PETSC_VIEWER_STDOUT_WORLD);CHKERRQ(ierr);*/ 184 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 185 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 186 ierr = MatIncreaseOverlap_MPIAIJ_Receive_K(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 187 ierr = PetscFree(toranks);CHKERRQ(ierr); 188 ierr = PetscFree(tosizes);CHKERRQ(ierr); 189 ierr = PetscFree(todata);CHKERRQ(ierr); 190 PetscFunctionReturn(0); 191 } 192 193 #undef __FUNCT__ 194 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_K" 195 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_K(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 196 { 197 PetscInt *isz,isz_i,i,j,is_id, data_size; 198 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 199 const PetscInt *indices_i_temp; 200 PetscErrorCode ierr; 201 202 PetscFunctionBegin; 203 max_lsize = 0; 204 ierr = PetscMalloc(nidx*sizeof(PetscInt),&isz);CHKERRQ(ierr); 205 for(i=0; i<nidx; i++){ 206 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 207 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 208 isz[i] = lsize; 209 } 210 ierr = PetscMalloc((max_lsize+nrecvs)*nidx*sizeof(PetscInt),&indices_temp);CHKERRQ(ierr); 211 for(i=0; i<nidx; i++){ 212 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 213 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 214 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 215 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 216 } 217 /*retrieve information */ 218 for(i=0; i<nrecvs; ){ 219 is_id = recvdata[i++]; 220 data_size = recvdata[i++]; 221 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 222 isz_i = isz[is_id]; 223 for(j=0; j< data_size; j++){ 224 col = recvdata[i++]; 225 indices_i[isz_i++] = col; 226 } 227 isz[is_id] = isz_i; 228 } 229 /*remove duplicate entities*/ 230 for(i=0; i<nidx; i++){ 231 indices_i = indices_temp+(max_lsize+nrecvs)*i; 232 isz_i = isz[i]; 233 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 234 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 235 } 236 ierr = PetscFree(isz);CHKERRQ(ierr); 237 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 238 PetscFunctionReturn(0); 239 } 240 241 #undef __FUNCT__ 242 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_K" 243 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_K(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 244 { 245 PetscLayout rmap,cmap; 246 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 247 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 248 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 249 const PetscInt *gcols,*ai,*aj,*bi,*bj; 250 Mat amat,bmat; 251 PetscMPIInt rank; 252 PetscBool done; 253 MPI_Comm comm; 254 PetscErrorCode ierr; 255 256 PetscFunctionBegin; 257 ierr = 
PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 258 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 259 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 260 /* Even if the mat is symmetric, we still assume it is not symmetric*/ 261 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 262 if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 263 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 264 if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 265 /*total number of nonzero values */ 266 tnz = ai[an]+bi[bn]; 267 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 268 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 269 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 270 /*longest message */ 271 max_fszs = 0; 272 for(i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 273 /*better way to estimate number of nonzero in the mat???*/ 274 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 275 for(i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 276 rows_pos = 0; 277 totalrows = 0; 278 for(i=0; i<nfrom; i++){ 279 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 280 /*group data*/ 281 for(j=0; j<fromsizes[2*i]; j+=2){ 282 is_id = fromrows[rows_pos++];/*no of is*/ 283 rows_i = rows_data[is_id]; 284 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 285 } 286 /*estimate a space to avoid multiple allocations */ 287 for(j=0; j<nidx; j++){ 288 indvc_ij = 0; 289 rows_i = rows_data[j]; 290 for(l=0; l<rows_pos_i[j]; l++){ 291 row = rows_i[l]-rstart; 292 start = ai[row]; 293 end = ai[row+1]; 294 for(k=start; k<end; k++){ /* Amat */ 295 col = aj[k] + cstart; 296 /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr); 297 if(owner 
!= fromranks[i])*/ 298 indices_tmp[indvc_ij++] = col;/*do not count the rows from the original rank*/ 299 } 300 start = bi[row]; 301 end = bi[row+1]; 302 for(k=start; k<end; k++) { /* Bmat */ 303 col = gcols[bj[k]]; 304 /*ierr = PetscLayoutFindOwner(rmap,col,&owner);CHKERRQ(ierr); 305 if(owner != fromranks[i]) */ 306 indices_tmp[indvc_ij++] = col; 307 } 308 } 309 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 310 indv_counts[i*nidx+j] = indvc_ij; 311 totalrows += indvc_ij; 312 } 313 } 314 /*message triple <no of is, number of rows, rows> */ 315 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 316 totalrows = 0; 317 rows_pos = 0; 318 /* use this code again */ 319 for(i=0;i<nfrom;i++){ 320 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 321 for(j=0; j<fromsizes[2*i]; j+=2){ 322 is_id = fromrows[rows_pos++]; 323 rows_i = rows_data[is_id]; 324 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 325 } 326 /* add data */ 327 for(j=0; j<nidx; j++){ 328 if(!indv_counts[i*nidx+j]) continue; 329 indvc_ij = 0; 330 sbdata[totalrows++] = j; 331 sbdata[totalrows++] = indv_counts[i*nidx+j]; 332 sbsizes[2*i] += 2; 333 rows_i = rows_data[j]; 334 for(l=0; l<rows_pos_i[j]; l++){ 335 row = rows_i[l]-rstart; 336 start = ai[row]; 337 end = ai[row+1]; 338 for(k=start; k<end; k++){ /* Amat */ 339 col = aj[k] + cstart; 340 indices_tmp[indvc_ij++] = col; 341 } 342 start = bi[row]; 343 end = bi[row+1]; 344 for(k=start; k<end; k++) { /* Bmat */ 345 col = gcols[bj[k]]; 346 indices_tmp[indvc_ij++] = col; 347 } 348 } 349 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 350 sbsizes[2*i] += indvc_ij; 351 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 352 totalrows += indvc_ij; 353 } 354 } 355 /* offsets */ 356 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 357 for(i=0; i<nfrom; i++){ 358 offsets[i+1] = offsets[i] + sbsizes[2*i]; 359 sbsizes[2*i+1] = 
offsets[i]; 360 } 361 ierr = PetscFree(offsets);CHKERRQ(ierr); 362 if(sbrowsizes) *sbrowsizes = sbsizes; 363 if(sbrows) *sbrows = sbdata; 364 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 365 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 366 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_K" 372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_K(Mat mat,PetscInt nidx, IS is[]) 373 { 374 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 375 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 376 PetscInt lsize,lsize_tmp,owner; 377 PetscMPIInt rank; 378 Mat amat,bmat; 379 PetscBool done; 380 PetscLayout cmap,rmap; 381 MPI_Comm comm; 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 386 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 387 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 388 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 389 if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 390 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 391 if(!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 392 /*is it a safe way to compute number of nonzero values ?*/ 393 tnz = ai[an]+bi[bn]; 394 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 395 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 396 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 397 /*it is a better way to estimate memory than the old implementation 398 * where global size of matrix is used 399 * */ 400 ierr = PetscMalloc(sizeof(PetscInt)*tnz,&indices_temp);CHKERRQ(ierr); 401 for(i=0; 
i<nidx; i++) { 402 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 403 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 404 lsize_tmp = 0; 405 for(j=0; j<lsize; j++) { 406 owner = -1; 407 row = indices[j]; 408 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 409 if(owner != rank) continue; 410 /*local number*/ 411 row -= rstart; 412 start = ai[row]; 413 end = ai[row+1]; 414 for(k=start; k<end; k++) { /* Amat */ 415 col = aj[k] + cstart; 416 indices_temp[lsize_tmp++] = col; 417 } 418 start = bi[row]; 419 end = bi[row+1]; 420 for(k=start; k<end; k++) { /* Bmat */ 421 col = gcols[bj[k]]; 422 indices_temp[lsize_tmp++] = col; 423 } 424 } 425 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 426 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 427 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 428 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 429 } 430 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 431 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 432 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 433 PetscFunctionReturn(0); 434 } 435 436 437 /* 438 Sample message format: 439 If a processor A wants processor B to process some elements corresponding 440 to index sets is[1],is[5] 441 mesg [0] = 2 (no of index sets in the mesg) 442 ----------- 443 mesg [1] = 1 => is[1] 444 mesg [2] = sizeof(is[1]); 445 ----------- 446 mesg [3] = 5 => is[5] 447 mesg [4] = sizeof(is[5]); 448 ----------- 449 mesg [5] 450 mesg [n] datas[1] 451 ----------- 452 mesg[n+1] 453 mesg[m] data(is[5]) 454 ----------- 455 456 Notes: 457 nrqs - no of requests sent (or to be sent out) 458 nrqr - no of requests recieved (which have to be or which have been processed 459 */ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 463 
{ 464 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 465 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 466 const PetscInt **idx,*idx_i; 467 PetscInt *n,**data,len; 468 PetscErrorCode ierr; 469 PetscMPIInt size,rank,tag1,tag2; 470 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 471 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 472 PetscBT *table; 473 MPI_Comm comm; 474 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 475 MPI_Status *s_status,*recv_status; 476 char *t_p; 477 478 PetscFunctionBegin; 479 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 480 size = c->size; 481 rank = c->rank; 482 M = C->rmap->N; 483 484 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 485 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 486 487 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 488 489 for (i=0; i<imax; i++) { 490 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 491 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 492 } 493 494 /* evaluate communication - mesg to who,length of mesg, and buffer space 495 required. 
Based on this, buffers are allocated, and data copied into them*/ 496 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 497 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 498 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 499 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 500 for (i=0; i<imax; i++) { 501 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 502 idx_i = idx[i]; 503 len = n[i]; 504 for (j=0; j<len; j++) { 505 row = idx_i[j]; 506 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 507 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 508 w4[proc]++; 509 } 510 for (j=0; j<size; j++) { 511 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 512 } 513 } 514 515 nrqs = 0; /* no of outgoing messages */ 516 msz = 0; /* total mesg length (for all proc */ 517 w1[rank] = 0; /* no mesg sent to intself */ 518 w3[rank] = 0; 519 for (i=0; i<size; i++) { 520 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 521 } 522 /* pa - is list of processors to communicate with */ 523 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 524 for (i=0,j=0; i<size; i++) { 525 if (w1[i]) {pa[j] = i; j++;} 526 } 527 528 /* Each message would have a header = 1 + 2*(no of IS) + data */ 529 for (i=0; i<nrqs; i++) { 530 j = pa[i]; 531 w1[j] += w2[j] + 2*w3[j]; 532 msz += w1[j]; 533 } 534 535 /* Determine the number of messages to expect, their lengths, from from-ids */ 536 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 537 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 538 539 /* Now post the Irecvs corresponding to these messages */ 540 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 541 542 /* Allocate Memory for outgoing 
messages */ 543 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 544 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 545 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 546 547 { 548 PetscInt *iptr = tmp,ict = 0; 549 for (i=0; i<nrqs; i++) { 550 j = pa[i]; 551 iptr += ict; 552 outdat[j] = iptr; 553 ict = w1[j]; 554 } 555 } 556 557 /* Form the outgoing messages */ 558 /*plug in the headers*/ 559 for (i=0; i<nrqs; i++) { 560 j = pa[i]; 561 outdat[j][0] = 0; 562 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 563 ptr[j] = outdat[j] + 2*w3[j] + 1; 564 } 565 566 /* Memory for doing local proc's work*/ 567 { 568 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 569 570 for (i=0; i<imax; i++) { 571 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 572 data[i] = d_p + M*i; 573 } 574 } 575 576 /* Parse the IS and update local tables and the outgoing buf with the data*/ 577 { 578 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 579 PetscBT table_i; 580 581 for (i=0; i<imax; i++) { 582 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 583 n_i = n[i]; 584 table_i = table[i]; 585 idx_i = idx[i]; 586 data_i = data[i]; 587 isz_i = isz[i]; 588 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 589 row = idx_i[j]; 590 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 591 if (proc != rank) { /* copy to the outgoing buffer */ 592 ctr[proc]++; 593 *ptr[proc] = row; 594 ptr[proc]++; 595 } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */ 596 } 597 /* Update the headers for the current IS */ 598 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 599 if ((ctr_j = ctr[j])) { 600 outdat_j = outdat[j]; 601 k = ++outdat_j[0]; 602 outdat_j[2*k] = ctr_j; 603 outdat_j[2*k-1] = i; 604 } 605 } 606 isz[i] = isz_i; 607 } 608 } 609 610 /* Now post the sends */ 
611 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 612 for (i=0; i<nrqs; ++i) { 613 j = pa[i]; 614 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 615 } 616 617 /* No longer need the original indices*/ 618 for (i=0; i<imax; ++i) { 619 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 620 } 621 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 622 623 for (i=0; i<imax; ++i) { 624 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 625 } 626 627 /* Do Local work*/ 628 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 629 630 /* Receive messages*/ 631 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 632 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 633 634 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 635 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 636 637 /* Phase 1 sends are complete - deallocate buffers */ 638 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 639 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 640 641 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 642 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 643 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 644 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 645 ierr = PetscFree(rbuf);CHKERRQ(ierr); 646 647 648 /* Send the data back*/ 649 /* Do a global reduction to know the buffer space req for incoming messages*/ 650 { 651 PetscMPIInt *rw1; 652 653 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 654 655 for (i=0; i<nrqr; ++i) { 656 proc = recv_status[i].MPI_SOURCE; 657 658 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 659 rw1[proc] = isz1[i]; 660 } 661 ierr = PetscFree(onodes1);CHKERRQ(ierr); 662 ierr = PetscFree(olengths1);CHKERRQ(ierr); 663 664 /* Determine the number of messages to expect, their lengths, from from-ids */ 665 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 666 ierr = 
PetscFree(rw1);CHKERRQ(ierr); 667 } 668 /* Now post the Irecvs corresponding to these messages */ 669 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 670 671 /* Now post the sends */ 672 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 673 for (i=0; i<nrqr; ++i) { 674 j = recv_status[i].MPI_SOURCE; 675 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 676 } 677 678 /* receive work done on other processors*/ 679 { 680 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 681 PetscMPIInt idex; 682 PetscBT table_i; 683 MPI_Status *status2; 684 685 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 686 for (i=0; i<nrqs; ++i) { 687 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 688 /* Process the message*/ 689 rbuf2_i = rbuf2[idex]; 690 ct1 = 2*rbuf2_i[0]+1; 691 jmax = rbuf2[idex][0]; 692 for (j=1; j<=jmax; j++) { 693 max = rbuf2_i[2*j]; 694 is_no = rbuf2_i[2*j-1]; 695 isz_i = isz[is_no]; 696 data_i = data[is_no]; 697 table_i = table[is_no]; 698 for (k=0; k<max; k++,ct1++) { 699 row = rbuf2_i[ct1]; 700 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 701 } 702 isz[is_no] = isz_i; 703 } 704 } 705 706 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 707 ierr = PetscFree(status2);CHKERRQ(ierr); 708 } 709 710 for (i=0; i<imax; ++i) { 711 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 712 } 713 714 ierr = PetscFree(onodes2);CHKERRQ(ierr); 715 ierr = PetscFree(olengths2);CHKERRQ(ierr); 716 717 ierr = PetscFree(pa);CHKERRQ(ierr); 718 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 719 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 720 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 721 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 722 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 723 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 724 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 725 ierr = 
PetscFree(s_status);CHKERRQ(ierr); 726 ierr = PetscFree(recv_status);CHKERRQ(ierr); 727 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 728 ierr = PetscFree(xdata);CHKERRQ(ierr); 729 ierr = PetscFree(isz1);CHKERRQ(ierr); 730 PetscFunctionReturn(0); 731 } 732 733 #undef __FUNCT__ 734 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 735 /* 736 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 737 the work on the local processor. 738 739 Inputs: 740 C - MAT_MPIAIJ; 741 imax - total no of index sets processed at a time; 742 table - an array of char - size = m bits. 743 744 Output: 745 isz - array containing the count of the solution elements corresponding 746 to each index set; 747 data - pointer to the solutions 748 */ 749 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 750 { 751 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 752 Mat A = c->A,B = c->B; 753 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 754 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 755 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 756 PetscBT table_i; 757 758 PetscFunctionBegin; 759 rstart = C->rmap->rstart; 760 cstart = C->cmap->rstart; 761 ai = a->i; 762 aj = a->j; 763 bi = b->i; 764 bj = b->j; 765 garray = c->garray; 766 767 768 for (i=0; i<imax; i++) { 769 data_i = data[i]; 770 table_i = table[i]; 771 isz_i = isz[i]; 772 for (j=0,max=isz[i]; j<max; j++) { 773 row = data_i[j] - rstart; 774 start = ai[row]; 775 end = ai[row+1]; 776 for (k=start; k<end; k++) { /* Amat */ 777 val = aj[k] + cstart; 778 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 779 } 780 start = bi[row]; 781 end = bi[row+1]; 782 for (k=start; k<end; k++) { /* Bmat */ 783 val = garray[bj[k]]; 784 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 785 } 786 } 787 isz[i] = isz_i; 788 } 789 PetscFunctionReturn(0); 790 } 791 792 #undef __FUNCT__ 793 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 794 /* 
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
      C    - the matrix
      nrqr - number of messages being processed
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   Message layout (both rbuf[i] and xdata[i]):
      [0]            number of index sets in this message
      [2*j-1],[2*j]  for set j: (IS id, number of rows for that set)
      then the row indices themselves, set after set.

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count the total number of requested rows (total_sz) and header entries (ct) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial size estimate for the single output buffer; grown geometrically below if exceeded */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 tracks total entries written into xdata[0] across all messages */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* xtable marks which global rows are already in the output for this IS */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase the per-message pointers
               xdata[1..i]; only messages <= i have valid isz1 entries at this point */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* for every requested row just recorded, add the columns of that row
         (diagonal part A and off-diagonal part B) as the increased overlap */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;       /* local diagonal column -> global */
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];        /* off-diagonal column via global map */
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* next message starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/* Gather the whole parallel matrix A onto every process as a sequential matrix.
   flag selects whether numerical values are gathered (MAT_GET_VALUES) or only the
   nonzero structure; scall selects initial creation vs. reuse of *Bin[0]. */
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
 MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* local row length = nnz in diagonal block (ad) + nnz in off-diagonal block (bd) */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* write directly into B's j array at my row block */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge the bd (off-diagonal) and ad (diagonal) columns of each row so the
       global column indices come out in sorted order: below-diagonal bd part,
       then ad part, then above-diagonal bd part */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column-index pass above,
       so values line up with the already-gathered j array */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered matrix */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/* Extract the submatrices C(isrow[i],iscol[i]) as sequential matrices on each
   process, staging the extraction to bound memory use. */
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
  /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
     However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
     take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
  */
  for (i = 0; i < ismax; ++i) {
    PetscBool sorted;
    ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
    if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
  }

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      /* user may opt out of the fast all-gather path */
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* MPI_MIN: take the fast path only if every process wants it */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  /* processes with fewer local stages still participate with max_no == 0
     so the collective calls inside _Local stay matched */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
/* Worker for MatGetSubMatrices_MPIAIJ: extracts up to ismax submatrices in one
   communication round (four tag'ed message phases: row requests, reply sizes,
   column indices, numerical values). */
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request
*r_waits4,*s_waits3,*s_waits4; 1230 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 1231 MPI_Status *r_status3,*r_status4,*s_status4; 1232 MPI_Comm comm; 1233 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 1234 PetscMPIInt *onodes1,*olengths1; 1235 PetscMPIInt idex,idex2,end; 1236 1237 PetscFunctionBegin; 1238 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1239 tag0 = ((PetscObject)C)->tag; 1240 size = c->size; 1241 rank = c->rank; 1242 1243 /* Get some new tags to keep the communication clean */ 1244 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 1245 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1246 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1247 1248 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 1249 1250 for (i=0; i<ismax; i++) { 1251 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 1252 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 1253 if (allcolumns[i]) { 1254 icol[i] = NULL; 1255 ncol[i] = C->cmap->N; 1256 } else { 1257 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 1258 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 1259 } 1260 } 1261 1262 /* evaluate communication - mesg to who, length of mesg, and buffer space 1263 required. 
Based on this, buffers are allocated, and data copied into them*/ 1264 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 1265 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1266 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1267 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1268 for (i=0; i<ismax; i++) { 1269 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 1270 jmax = nrow[i]; 1271 irow_i = irow[i]; 1272 for (j=0; j<jmax; j++) { 1273 l = 0; 1274 row = irow_i[j]; 1275 while (row >= C->rmap->range[l+1]) l++; 1276 proc = l; 1277 w4[proc]++; 1278 } 1279 for (j=0; j<size; j++) { 1280 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 1281 } 1282 } 1283 1284 nrqs = 0; /* no of outgoing messages */ 1285 msz = 0; /* total mesg length (for all procs) */ 1286 w1[rank] = 0; /* no mesg sent to self */ 1287 w3[rank] = 0; 1288 for (i=0; i<size; i++) { 1289 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 1290 } 1291 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1292 for (i=0,j=0; i<size; i++) { 1293 if (w1[i]) { pa[j] = i; j++; } 1294 } 1295 1296 /* Each message would have a header = 1 + 2*(no of IS) + data */ 1297 for (i=0; i<nrqs; i++) { 1298 j = pa[i]; 1299 w1[j] += w2[j] + 2* w3[j]; 1300 msz += w1[j]; 1301 } 1302 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1303 1304 /* Determine the number of messages to expect, their lengths, from from-ids */ 1305 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1306 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1307 1308 /* Now post the Irecvs corresponding to these messages */ 1309 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1310 
1311 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1312 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1313 1314 /* Allocate Memory for outgoing messages */ 1315 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1316 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1317 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1318 1319 { 1320 PetscInt *iptr = tmp,ict = 0; 1321 for (i=0; i<nrqs; i++) { 1322 j = pa[i]; 1323 iptr += ict; 1324 sbuf1[j] = iptr; 1325 ict = w1[j]; 1326 } 1327 } 1328 1329 /* Form the outgoing messages */ 1330 /* Initialize the header space */ 1331 for (i=0; i<nrqs; i++) { 1332 j = pa[i]; 1333 sbuf1[j][0] = 0; 1334 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 1335 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 1336 } 1337 1338 /* Parse the isrow and copy data into outbuf */ 1339 for (i=0; i<ismax; i++) { 1340 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1341 irow_i = irow[i]; 1342 jmax = nrow[i]; 1343 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 1344 l = 0; 1345 row = irow_i[j]; 1346 while (row >= C->rmap->range[l+1]) l++; 1347 proc = l; 1348 if (proc != rank) { /* copy to the outgoing buf*/ 1349 ctr[proc]++; 1350 *ptr[proc] = row; 1351 ptr[proc]++; 1352 } 1353 } 1354 /* Update the headers for the current IS */ 1355 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1356 if ((ctr_j = ctr[j])) { 1357 sbuf1_j = sbuf1[j]; 1358 k = ++sbuf1_j[0]; 1359 sbuf1_j[2*k] = ctr_j; 1360 sbuf1_j[2*k-1] = i; 1361 } 1362 } 1363 } 1364 1365 /* Now post the sends */ 1366 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1367 for (i=0; i<nrqs; ++i) { 1368 j = pa[i]; 1369 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 1370 } 1371 1372 /* Post Receives to capture the buffer size */ 1373 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 1374 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 1375 rbuf2[0] = tmp + msz; 1376 for (i=1; 
i<nrqs; ++i) { 1377 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 1378 } 1379 for (i=0; i<nrqs; ++i) { 1380 j = pa[i]; 1381 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 1382 } 1383 1384 /* Send to other procs the buf size they should allocate */ 1385 1386 1387 /* Receive messages*/ 1388 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 1389 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1390 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 1391 { 1392 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 1393 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 1394 PetscInt *sbuf2_i; 1395 1396 for (i=0; i<nrqr; ++i) { 1397 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1398 1399 req_size[idex] = 0; 1400 rbuf1_i = rbuf1[idex]; 1401 start = 2*rbuf1_i[0] + 1; 1402 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1403 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 1404 sbuf2_i = sbuf2[idex]; 1405 for (j=start; j<end; j++) { 1406 id = rbuf1_i[j] - rstart; 1407 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1408 sbuf2_i[j] = ncols; 1409 req_size[idex] += ncols; 1410 } 1411 req_source[idex] = r_status1[i].MPI_SOURCE; 1412 /* form the header */ 1413 sbuf2_i[0] = req_size[idex]; 1414 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1415 1416 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1417 } 1418 } 1419 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1420 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1421 1422 /* recv buffer sizes */ 1423 /* Receive messages*/ 1424 1425 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 1426 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1427 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 1428 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1429 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 1430 1431 for (i=0; i<nrqs; ++i) { 1432 ierr = 
MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1433 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 1434 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 1435 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1436 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1437 } 1438 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1439 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1440 1441 /* Wait on sends1 and sends2 */ 1442 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1443 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 1444 1445 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1446 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1447 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1448 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1449 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1450 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1451 1452 /* Now allocate buffers for a->j, and send them off */ 1453 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1454 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1455 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1456 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1457 1458 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 1459 { 1460 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1461 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1462 PetscInt cend = C->cmap->rend; 1463 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1464 1465 for (i=0; i<nrqr; i++) { 1466 rbuf1_i = rbuf1[i]; 1467 sbuf_aj_i = sbuf_aj[i]; 1468 ct1 = 2*rbuf1_i[0] + 1; 1469 ct2 = 0; 1470 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1471 kmax = rbuf1[i][2*j]; 1472 for (k=0; k<kmax; k++,ct1++) { 1473 row = rbuf1_i[ct1] - rstart; 1474 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 
1475 ncols = nzA + nzB; 1476 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1477 1478 /* load the column indices for this row into cols*/ 1479 cols = sbuf_aj_i + ct2; 1480 1481 lwrite = 0; 1482 for (l=0; l<nzB; l++) { 1483 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1484 } 1485 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1486 for (l=0; l<nzB; l++) { 1487 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1488 } 1489 1490 ct2 += ncols; 1491 } 1492 } 1493 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1494 } 1495 } 1496 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 1497 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 1498 1499 /* Allocate buffers for a->a, and send them off */ 1500 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1501 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1502 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1503 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1504 1505 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1506 { 1507 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1508 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1509 PetscInt cend = C->cmap->rend; 1510 PetscInt *b_j = b->j; 1511 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1512 1513 for (i=0; i<nrqr; i++) { 1514 rbuf1_i = rbuf1[i]; 1515 sbuf_aa_i = sbuf_aa[i]; 1516 ct1 = 2*rbuf1_i[0]+1; 1517 ct2 = 0; 1518 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1519 kmax = rbuf1_i[2*j]; 1520 for (k=0; k<kmax; k++,ct1++) { 1521 row = rbuf1_i[ct1] - rstart; 1522 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1523 ncols = nzA + nzB; 1524 cworkB = b_j + b_i[row]; 1525 vworkA = a_a + a_i[row]; 1526 vworkB = b_a + b_i[row]; 1527 1528 /* load the column values for this row into vals*/ 1529 vals = sbuf_aa_i+ct2; 1530 1531 lwrite = 0; 1532 for (l=0; l<nzB; l++) { 1533 if ((bmap[cworkB[l]]) < cstart) 
vals[lwrite++] = vworkB[l]; 1534 } 1535 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1536 for (l=0; l<nzB; l++) { 1537 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1538 } 1539 1540 ct2 += ncols; 1541 } 1542 } 1543 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1544 } 1545 } 1546 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1547 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1548 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1549 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1550 1551 /* Form the matrix */ 1552 /* create col map: global col of C -> local col of submatrices */ 1553 { 1554 const PetscInt *icol_i; 1555 #if defined(PETSC_USE_CTABLE) 1556 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1557 for (i=0; i<ismax; i++) { 1558 if (!allcolumns[i]) { 1559 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1560 1561 jmax = ncol[i]; 1562 icol_i = icol[i]; 1563 cmap_i = cmap[i]; 1564 for (j=0; j<jmax; j++) { 1565 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1566 } 1567 } else { 1568 cmap[i] = NULL; 1569 } 1570 } 1571 #else 1572 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1573 for (i=0; i<ismax; i++) { 1574 if (!allcolumns[i]) { 1575 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1576 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1577 jmax = ncol[i]; 1578 icol_i = icol[i]; 1579 cmap_i = cmap[i]; 1580 for (j=0; j<jmax; j++) { 1581 cmap_i[icol_i[j]] = j+1; 1582 } 1583 } else { 1584 cmap[i] = NULL; 1585 } 1586 } 1587 #endif 1588 } 1589 1590 /* Create lens which is required for MatCreate... 
*/ 1591 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1592 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1593 if (ismax) { 1594 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1595 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1596 } 1597 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1598 1599 /* Update lens from local data */ 1600 for (i=0; i<ismax; i++) { 1601 jmax = nrow[i]; 1602 if (!allcolumns[i]) cmap_i = cmap[i]; 1603 irow_i = irow[i]; 1604 lens_i = lens[i]; 1605 for (j=0; j<jmax; j++) { 1606 l = 0; 1607 row = irow_i[j]; 1608 while (row >= C->rmap->range[l+1]) l++; 1609 proc = l; 1610 if (proc == rank) { 1611 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1612 if (!allcolumns[i]) { 1613 for (k=0; k<ncols; k++) { 1614 #if defined(PETSC_USE_CTABLE) 1615 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1616 #else 1617 tcol = cmap_i[cols[k]]; 1618 #endif 1619 if (tcol) lens_i[j]++; 1620 } 1621 } else { /* allcolumns */ 1622 lens_i[j] = ncols; 1623 } 1624 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1625 } 1626 } 1627 } 1628 1629 /* Create row map: global row of C -> local row of submatrices */ 1630 #if defined(PETSC_USE_CTABLE) 1631 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1632 for (i=0; i<ismax; i++) { 1633 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1634 rmap_i = rmap[i]; 1635 irow_i = irow[i]; 1636 jmax = nrow[i]; 1637 for (j=0; j<jmax; j++) { 1638 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1639 } 1640 } 1641 #else 1642 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1643 if (ismax) { 1644 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1645 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1646 } 1647 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1648 for (i=0; i<ismax; i++) { 1649 rmap_i = rmap[i]; 1650 irow_i = irow[i]; 1651 jmax = nrow[i]; 1652 for (j=0; j<jmax; 
j++) { 1653 rmap_i[irow_i[j]] = j; 1654 } 1655 } 1656 #endif 1657 1658 /* Update lens from offproc data */ 1659 { 1660 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1661 1662 for (tmp2=0; tmp2<nrqs; tmp2++) { 1663 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1664 idex = pa[idex2]; 1665 sbuf1_i = sbuf1[idex]; 1666 jmax = sbuf1_i[0]; 1667 ct1 = 2*jmax+1; 1668 ct2 = 0; 1669 rbuf2_i = rbuf2[idex2]; 1670 rbuf3_i = rbuf3[idex2]; 1671 for (j=1; j<=jmax; j++) { 1672 is_no = sbuf1_i[2*j-1]; 1673 max1 = sbuf1_i[2*j]; 1674 lens_i = lens[is_no]; 1675 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1676 rmap_i = rmap[is_no]; 1677 for (k=0; k<max1; k++,ct1++) { 1678 #if defined(PETSC_USE_CTABLE) 1679 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1680 row--; 1681 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1682 #else 1683 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1684 #endif 1685 max2 = rbuf2_i[ct1]; 1686 for (l=0; l<max2; l++,ct2++) { 1687 if (!allcolumns[is_no]) { 1688 #if defined(PETSC_USE_CTABLE) 1689 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1690 #else 1691 tcol = cmap_i[rbuf3_i[ct2]]; 1692 #endif 1693 if (tcol) lens_i[row]++; 1694 } else { /* allcolumns */ 1695 lens_i[row]++; /* lens_i[row] += max2 ? */ 1696 } 1697 } 1698 } 1699 } 1700 } 1701 } 1702 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1703 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1704 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1705 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1706 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1707 1708 /* Create the submatrices */ 1709 if (scall == MAT_REUSE_MATRIX) { 1710 PetscBool flag; 1711 1712 /* 1713 Assumes new rows are same length as the old rows,hence bug! 
1714 */ 1715 for (i=0; i<ismax; i++) { 1716 mat = (Mat_SeqAIJ*)(submats[i]->data); 1717 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1718 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1719 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1720 /* Initial matrix as if empty */ 1721 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1722 1723 submats[i]->factortype = C->factortype; 1724 } 1725 } else { 1726 for (i=0; i<ismax; i++) { 1727 PetscInt rbs,cbs; 1728 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1729 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1730 1731 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1732 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1733 1734 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1735 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1736 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1737 } 1738 } 1739 1740 /* Assemble the matrices */ 1741 /* First assemble the local rows */ 1742 { 1743 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1744 PetscScalar *imat_a; 1745 1746 for (i=0; i<ismax; i++) { 1747 mat = (Mat_SeqAIJ*)submats[i]->data; 1748 imat_ilen = mat->ilen; 1749 imat_j = mat->j; 1750 imat_i = mat->i; 1751 imat_a = mat->a; 1752 1753 if (!allcolumns[i]) cmap_i = cmap[i]; 1754 rmap_i = rmap[i]; 1755 irow_i = irow[i]; 1756 jmax = nrow[i]; 1757 for (j=0; j<jmax; j++) { 1758 l = 0; 1759 row = irow_i[j]; 1760 while (row >= C->rmap->range[l+1]) l++; 1761 proc = l; 1762 if (proc == rank) { 1763 old_row = row; 1764 #if defined(PETSC_USE_CTABLE) 1765 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1766 row--; 1767 #else 1768 row = rmap_i[row]; 1769 #endif 
1770 ilen_row = imat_ilen[row]; 1771 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1772 mat_i = imat_i[row]; 1773 mat_a = imat_a + mat_i; 1774 mat_j = imat_j + mat_i; 1775 if (!allcolumns[i]) { 1776 for (k=0; k<ncols; k++) { 1777 #if defined(PETSC_USE_CTABLE) 1778 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1779 #else 1780 tcol = cmap_i[cols[k]]; 1781 #endif 1782 if (tcol) { 1783 *mat_j++ = tcol - 1; 1784 *mat_a++ = vals[k]; 1785 ilen_row++; 1786 } 1787 } 1788 } else { /* allcolumns */ 1789 for (k=0; k<ncols; k++) { 1790 *mat_j++ = cols[k]; /* global col index! */ 1791 *mat_a++ = vals[k]; 1792 ilen_row++; 1793 } 1794 } 1795 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1796 1797 imat_ilen[row] = ilen_row; 1798 } 1799 } 1800 } 1801 } 1802 1803 /* Now assemble the off proc rows*/ 1804 { 1805 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1806 PetscInt *imat_j,*imat_i; 1807 PetscScalar *imat_a,*rbuf4_i; 1808 1809 for (tmp2=0; tmp2<nrqs; tmp2++) { 1810 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1811 idex = pa[idex2]; 1812 sbuf1_i = sbuf1[idex]; 1813 jmax = sbuf1_i[0]; 1814 ct1 = 2*jmax + 1; 1815 ct2 = 0; 1816 rbuf2_i = rbuf2[idex2]; 1817 rbuf3_i = rbuf3[idex2]; 1818 rbuf4_i = rbuf4[idex2]; 1819 for (j=1; j<=jmax; j++) { 1820 is_no = sbuf1_i[2*j-1]; 1821 rmap_i = rmap[is_no]; 1822 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1823 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1824 imat_ilen = mat->ilen; 1825 imat_j = mat->j; 1826 imat_i = mat->i; 1827 imat_a = mat->a; 1828 max1 = sbuf1_i[2*j]; 1829 for (k=0; k<max1; k++,ct1++) { 1830 row = sbuf1_i[ct1]; 1831 #if defined(PETSC_USE_CTABLE) 1832 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1833 row--; 1834 #else 1835 row = rmap_i[row]; 1836 #endif 1837 ilen = imat_ilen[row]; 1838 mat_i = imat_i[row]; 1839 mat_a = imat_a + mat_i; 1840 mat_j = imat_j + mat_i; 1841 max2 = rbuf2_i[ct1]; 1842 if (!allcolumns[is_no]) { 1843 
for (l=0; l<max2; l++,ct2++) { 1844 1845 #if defined(PETSC_USE_CTABLE) 1846 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1847 #else 1848 tcol = cmap_i[rbuf3_i[ct2]]; 1849 #endif 1850 if (tcol) { 1851 *mat_j++ = tcol - 1; 1852 *mat_a++ = rbuf4_i[ct2]; 1853 ilen++; 1854 } 1855 } 1856 } else { /* allcolumns */ 1857 for (l=0; l<max2; l++,ct2++) { 1858 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1859 *mat_a++ = rbuf4_i[ct2]; 1860 ilen++; 1861 } 1862 } 1863 imat_ilen[row] = ilen; 1864 } 1865 } 1866 } 1867 } 1868 1869 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1870 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1871 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1872 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1873 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1874 1875 /* Restore the indices */ 1876 for (i=0; i<ismax; i++) { 1877 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1878 if (!allcolumns[i]) { 1879 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1880 } 1881 } 1882 1883 /* Destroy allocated memory */ 1884 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1885 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1886 ierr = PetscFree(pa);CHKERRQ(ierr); 1887 1888 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1889 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1890 for (i=0; i<nrqr; ++i) { 1891 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1892 } 1893 for (i=0; i<nrqs; ++i) { 1894 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1895 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1896 } 1897 1898 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1899 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1900 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1901 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1902 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1903 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1904 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1905 1906 #if defined(PETSC_USE_CTABLE) 1907 for (i=0; i<ismax; i++) {ierr = 
PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  /* Free the per-IS column maps; they exist only for ISs that were not "all columns" */
  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  /* lens was allocated as one contiguous chunk anchored at lens[0] */
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Finalize every extracted sequential submatrix */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   Builds an MPIAIJ matrix C on communicator comm whose local diagonal block is
   the SEQAIJ matrix A and whose local off-diagonal block is the SEQAIJ matrix B.

   Note: C takes over A and B WITHOUT increasing their reference counts (see the
   note above); the caller must not destroy them independently of C.

   Input Parameters:
+  comm - communicator for the new parallel matrix
.  A    - sequential matrix holding the local diagonal part
-  B    - sequential matrix holding the local off-diagonal part

   Output Parameter:
.  C    - the resulting MPIAIJ matrix
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* Local row counts and block sizes must agree; PetscAbs() presumably normalizes
     not-yet-set (negative) block sizes -- TODO confirm against PetscLayout semantics */
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The global column space of C must be exactly the diag columns plus the off-diag columns */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  aij = (Mat_MPIAIJ*)((*C)->data);
  /* Plug the given Seq matrices in directly -- no reference increment is taken */
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* C is preallocated/assembled exactly to the extent that both of its pieces are */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
   Returns (without copying) the diagonal (A) and off-diagonal (B) sequential
   blocks stored inside the MPIAIJ matrix C.  The references are borrowed:
   neither *A nor *B is increfed, so the caller must not destroy them.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful!
*/
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   Extracts one submatrix of C per (isrow[i], iscol[i]) pair.  Index sets whose
   communicator has size 1 yield sequential submatrices; parallel index sets
   yield parallel matrices assembled from separately extracted diagonal and
   off-diagonal sequential pieces.  The three callbacks supply the type-specific
   operations:
     getsubmats_seq - extracts arrays of sequential submatrices,
     makefromseq    - builds a parallel matrix from a diag/off-diag Seq pair,
     extractseq     - recovers the Seq pair from an existing parallel matrix
                      (used when scall == MAT_REUSE_MATRIX).
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;  /* number of index sets living on a parallel (size > 1) communicator */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Validate that each IS pair shares a communicator, and count the parallel pairs */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat *A,*B;
    IS *isrow_c, *iscol_c;
    PetscMPIInt size;  /* NOTE(review): shadows the outer 'size' declared above; harmless but worth renaming */
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
      array of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
      getsubmats_seq will deposit the extracted diag and off-diag parts.
      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i];
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i];  /* aliased, not copied: only iscol_c entries are destroyed below */

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else (*submat)[i] = A[i];
      }
    }
    /* Free only the holder arrays; the Seq matrices themselves now live inside (or are aliased by) *submat */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
} /* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   AIJ instantiation of MatGetSubMatricesParallel_MPIXAIJ: wires in the SeqAIJ
   submatrix extraction, the MPIAIJ-from-Seq assembly, and the MPIAIJ
   decomposition callbacks.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}