/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Grow each of the imax index sets is[] by ov
   levels of overlap, by applying the one-level routine ov times.

   Collective on the communicator of C.  The entries of is[] are replaced
   (destroyed and recreated) by the one-level routine.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as
   MatIncreaseOverlap_MPIAIJ, but each overlap level is computed with the
   memory-scalable (PetscSF-based) one-level routine.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One level of overlap growth,
   implemented with two-sided rendezvous (PetscCommBuildTwoSided) plus
   PetscSF broadcasts so that no process needs O(global size) storage.

   Outline:
     1) gather all requested rows from the nidx index sets, and sort out
        which of them live on remote processes;
     2) if no process has remote rows (typical for the first level), do the
        purely local expansion and return;
     3) otherwise ship <is id, row> pairs to the owners, let the owners
        compute the adjacency of those rows (Send_Scalable), ship the
        results back, and merge them into is[] (Receive_Scalable),
        overlapping the return communication with the local expansion.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second rendezvous: the former receivers now send the computed overlap back */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge the overlap data that
   came back from remote owners into the local index sets.

   recvdata is a packed stream of triples <is id, count, count rows>;
   nrecvs is its total length.  Each is[i] is destroyed and recreated with
   the union of its old entries and the received rows, sorted and with
   duplicates removed.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* one slab of size (max_lsize+nrecvs) per IS: enough for old entries plus
     everything that could possibly arrive for that IS */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For each row requested by a
   remote process, compute that row's column adjacency (diagonal block A
   shifted by cstart, off-diagonal block B mapped through the global column
   map gcols) and pack it for the return trip.

   fromrows holds <is id, row> pairs grouped by sender; on output *sbrows is
   the packed triples <is id, count, columns...> and *sbrowsizes holds, per
   sender, the message size and its offset into *sbrows.  The caller frees
   both output arrays.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count, per sender and per IS, how many distinct columns
     will be sent back, so the result buffer can be sized in one shot */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, now actually packing the data */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each is[i] by the
   column adjacency of its locally-owned rows only (rows owned by other
   ranks are skipped here; they are handled by the send/receive pair).
   Each is[i] is destroyed and recreated sorted and duplicate-free.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
/*
   MatIncreaseOverlap_MPIAIJ_Once - One level of overlap growth using the
   original (non-scalable) all-to-all algorithm: rows needed from remote
   ranks are batched into one message per destination (format above),
   processed remotely by MatIncreaseOverlap_MPIAIJ_Receive, and merged back
   here.  Membership is tracked per IS with a bit table over all M global
   rows; with PETSC_USE_CTABLE the row lists themselves live in hash tables
   instead of dense M-sized arrays.  Each is[i] is destroyed and recreated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single workspace tmp into one contiguous slice per destination */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the requests received from other ranks */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size a scratch array large enough for the biggest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* recreate each IS from the accumulated rows */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      /* table stores <row+1, position+1>: undo both shifts */
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions

   For every row already in an index set, the row's column adjacency
   (A block shifted by cstart, B block mapped through garray) is appended,
   with the per-IS bit table preventing duplicates.  Only rows appended by
   previous overlap levels (j < max, the count on entry) are expanded.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
907 rbuf - an array of pointers to the recieved requests 908 909 Output: 910 xdata - array of messages to be sent back 911 isz1 - size of each message 912 913 For better efficiency perhaps we should malloc separately each xdata[i], 914 then if a remalloc is required we need only copy the data for that one row 915 rather then all previous rows as it is now where a single large chunck of 916 memory is used. 917 918 */ 919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 920 { 921 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 922 Mat A = c->A,B = c->B; 923 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 924 PetscErrorCode ierr; 925 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 926 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 927 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 928 PetscInt *rbuf_i,kmax,rbuf_0; 929 PetscBT xtable; 930 931 PetscFunctionBegin; 932 m = C->rmap->N; 933 rstart = C->rmap->rstart; 934 cstart = C->cmap->rstart; 935 ai = a->i; 936 aj = a->j; 937 bi = b->i; 938 bj = b->j; 939 garray = c->garray; 940 941 942 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 943 rbuf_i = rbuf[i]; 944 rbuf_0 = rbuf_i[0]; 945 ct += rbuf_0; 946 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 947 } 948 949 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 950 else max1 = 1; 951 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 952 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 953 ++no_malloc; 954 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 955 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 956 957 ct3 = 0; 958 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 959 rbuf_i = rbuf[i]; 960 rbuf_0 = rbuf_i[0]; 961 ct1 = 2*rbuf_0+1; 962 ct2 = ct1; 963 ct3 += ct1; 964 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 965 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 966 oct2 = ct2; 967 kmax = rbuf_i[2*j]; 968 for (k=0; k<kmax; k++,ct1++) { 969 row = rbuf_i[ct1]; 970 if (!PetscBTLookupSet(xtable,row)) { 971 if (!(ct3 < mem_estimate)) { 972 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 973 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 974 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 975 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 976 xdata[0] = tmp; 977 mem_estimate = new_estimate; ++no_malloc; 978 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 979 } 980 xdata[i][ct2++] = row; 981 ct3++; 982 } 983 } 984 for (k=oct2,max2=ct2; k<max2; k++) { 985 row = xdata[i][k] - rstart; 986 start = ai[row]; 987 end = ai[row+1]; 988 for (l=start; l<end; l++) { 989 val = aj[l] + cstart; 990 if (!PetscBTLookupSet(xtable,val)) { 991 if (!(ct3 < mem_estimate)) { 992 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 993 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 994 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 995 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 996 xdata[0] = tmp; 997 mem_estimate = new_estimate; ++no_malloc; 998 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 999 } 1000 xdata[i][ct2++] = val; 1001 ct3++; 1002 } 1003 } 1004 start = bi[row]; 1005 end = bi[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = garray[bj[l]]; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = 
(PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 } 1023 /* Update the header*/ 1024 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1025 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1026 } 1027 xdata[i][0] = rbuf_0; 1028 xdata[i+1] = xdata[i] + ct2; 1029 isz1[i] = ct2; /* size of each message */ 1030 } 1031 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1032 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1033 PetscFunctionReturn(0); 1034 } 1035 /* -------------------------------------------------------------------------*/ 1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1038 /* 1039 Every processor gets the entire matrix 1040 */ 1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1042 { 1043 Mat B; 1044 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1045 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1046 PetscErrorCode ierr; 1047 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1048 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1049 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1050 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1051 1052 PetscFunctionBegin; 1053 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1054 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1055 1056 if (scall == 
MAT_INITIAL_MATRIX) { 1057 /* ---------------------------------------------------------------- 1058 Tell every processor the number of nonzeros per row 1059 */ 1060 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1061 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1062 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1063 } 1064 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1065 for (i=0; i<size; i++) { 1066 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1067 displs[i] = A->rmap->range[i]; 1068 } 1069 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1070 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1071 #else 1072 sendcount = A->rmap->rend - A->rmap->rstart; 1073 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1074 #endif 1075 /* --------------------------------------------------------------- 1076 Create the sequential matrix of the same type as the local block diagonal 1077 */ 1078 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1079 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1080 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1081 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1082 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1083 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1084 **Bin = B; 1085 b = (Mat_SeqAIJ*)B->data; 1086 1087 /*-------------------------------------------------------------------- 1088 Copy my part of matrix column indices over 1089 */ 1090 sendcount = ad->nz + bd->nz; 1091 jsendbuf = b->j + b->i[rstarts[rank]]; 1092 a_jsendbuf = ad->j; 1093 b_jsendbuf = bd->j; 1094 n = A->rmap->rend - A->rmap->rstart; 1095 cnt = 0; 1096 for (i=0; i<n; i++) { 1097 1098 /* put in lower diagonal 
portion */ 1099 m = bd->i[i+1] - bd->i[i]; 1100 while (m > 0) { 1101 /* is it above diagonal (in bd (compressed) numbering) */ 1102 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1103 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1104 m--; 1105 } 1106 1107 /* put in diagonal portion */ 1108 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1109 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1110 } 1111 1112 /* put in upper diagonal portion */ 1113 while (m-- > 0) { 1114 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1115 } 1116 } 1117 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1118 1119 /*-------------------------------------------------------------------- 1120 Gather all column indices to all processors 1121 */ 1122 for (i=0; i<size; i++) { 1123 recvcounts[i] = 0; 1124 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1125 recvcounts[i] += lens[j]; 1126 } 1127 } 1128 displs[0] = 0; 1129 for (i=1; i<size; i++) { 1130 displs[i] = displs[i-1] + recvcounts[i-1]; 1131 } 1132 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1133 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1134 #else 1135 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1136 #endif 1137 /*-------------------------------------------------------------------- 1138 Assemble the matrix into useable form (note numerical values not yet set) 1139 */ 1140 /* set the b->ilen (length of each row) values */ 1141 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1142 /* set the b->i indices */ 1143 b->i[0] = 0; 1144 for (i=1; i<=A->rmap->N; i++) { 1145 b->i[i] = b->i[i-1] + lens[i-1]; 1146 } 1147 ierr = PetscFree(lens);CHKERRQ(ierr); 1148 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1149 ierr = 
MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1150 1151 } else { 1152 B = **Bin; 1153 b = (Mat_SeqAIJ*)B->data; 1154 } 1155 1156 /*-------------------------------------------------------------------- 1157 Copy my part of matrix numerical values into the values location 1158 */ 1159 if (flag == MAT_GET_VALUES) { 1160 sendcount = ad->nz + bd->nz; 1161 sendbuf = b->a + b->i[rstarts[rank]]; 1162 a_sendbuf = ad->a; 1163 b_sendbuf = bd->a; 1164 b_sendj = bd->j; 1165 n = A->rmap->rend - A->rmap->rstart; 1166 cnt = 0; 1167 for (i=0; i<n; i++) { 1168 1169 /* put in lower diagonal portion */ 1170 m = bd->i[i+1] - bd->i[i]; 1171 while (m > 0) { 1172 /* is it above diagonal (in bd (compressed) numbering) */ 1173 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1174 sendbuf[cnt++] = *b_sendbuf++; 1175 m--; 1176 b_sendj++; 1177 } 1178 1179 /* put in diagonal portion */ 1180 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1181 sendbuf[cnt++] = *a_sendbuf++; 1182 } 1183 1184 /* put in upper diagonal portion */ 1185 while (m-- > 0) { 1186 sendbuf[cnt++] = *b_sendbuf++; 1187 b_sendj++; 1188 } 1189 } 1190 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1191 1192 /* ----------------------------------------------------------------- 1193 Gather all numerical values to all processors 1194 */ 1195 if (!recvcounts) { 1196 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1197 } 1198 for (i=0; i<size; i++) { 1199 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1200 } 1201 displs[0] = 0; 1202 for (i=1; i<size; i++) { 1203 displs[i] = displs[i-1] + recvcounts[i-1]; 1204 } 1205 recvbuf = b->a; 1206 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1207 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1208 #else 1209 ierr = 
MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1210 #endif 1211 } /* endof (flag == MAT_GET_VALUES) */ 1212 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1213 1214 if (A->symmetric) { 1215 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1216 } else if (A->hermitian) { 1217 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1218 } else if (A->structurally_symmetric) { 1219 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1220 } 1221 PetscFunctionReturn(0); 1222 } 1223 1224 PetscErrorCode MatDestroy_Dummy_MatGetSubmatrices(Mat C) 1225 { 1226 PetscErrorCode ierr; 1227 Mat_SubMat *submatj = (Mat_SubMat*)C->data; 1228 PetscInt i; 1229 1230 PetscFunctionBegin; 1231 if (!submatj->id) { /* delete data that are linked only to submats[id=0] */ 1232 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1233 1234 for (i=0; i<submatj->nrqr; ++i) { 1235 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1236 } 1237 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1238 1239 if (submatj->rbuf1) { 1240 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1241 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1242 } 1243 1244 for (i=0; i<submatj->nrqs; ++i) { 1245 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1246 } 1247 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1248 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1249 } 1250 1251 #if defined(PETSC_USE_CTABLE) 1252 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1253 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1254 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1255 #else 1256 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1257 #endif 1258 1259 if (!submatj->allcolumns) { 1260 #if defined(PETSC_USE_CTABLE) 1261 ierr = 
PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1262 #else 1263 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1264 #endif 1265 } 1266 ierr = submatj->destroy(C);CHKERRQ(ierr); 1267 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 1268 1269 ierr = PetscFree(submatj);CHKERRQ(ierr); 1270 PetscFunctionReturn(0); 1271 } 1272 1273 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1274 { 1275 PetscErrorCode ierr; 1276 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1277 Mat_SubMat *submatj = c->submatis1; 1278 PetscInt i; 1279 1280 PetscFunctionBegin; 1281 if (!submatj->id) { /* delete data that are linked only to submats[id=0] */ 1282 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1283 1284 for (i=0; i<submatj->nrqr; ++i) { 1285 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1286 } 1287 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1288 1289 if (submatj->rbuf1) { 1290 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1291 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1292 } 1293 1294 for (i=0; i<submatj->nrqs; ++i) { 1295 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1296 } 1297 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1298 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1299 } 1300 1301 #if defined(PETSC_USE_CTABLE) 1302 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1303 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1304 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1305 #else 1306 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1307 #endif 1308 1309 if (!submatj->allcolumns) { 1310 #if defined(PETSC_USE_CTABLE) 1311 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1312 #else 1313 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1314 #endif 1315 } 1316 ierr = submatj->destroy(C);CHKERRQ(ierr); 1317 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 
1318 1319 ierr = PetscFree(submatj);CHKERRQ(ierr); 1320 PetscFunctionReturn(0); 1321 } 1322 1323 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1324 { 1325 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1326 Mat submat,A = c->A,B = c->B; 1327 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1328 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1329 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1330 const PetscInt *icol,*irow; 1331 PetscInt nrow,ncol,start; 1332 PetscErrorCode ierr; 1333 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1334 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1335 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1336 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1337 PetscInt *lens,rmax,ncols,*cols,Crow; 1338 #if defined(PETSC_USE_CTABLE) 1339 PetscTable cmap,rmap; 1340 PetscInt *cmap_loc,*rmap_loc; 1341 #else 1342 PetscInt *cmap,*rmap; 1343 #endif 1344 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1345 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1346 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1347 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1348 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1349 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1350 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1351 MPI_Comm comm; 1352 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1353 PetscMPIInt *onodes1,*olengths1,idex,end; 1354 Mat_SubMat *smatis1; 1355 PetscBool isrowsorted; 1356 1357 PetscFunctionBegin; 1358 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1359 1360 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 
1361 size = c->size; 1362 rank = c->rank; 1363 1364 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1365 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1366 1367 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1368 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1369 if (allcolumns) { 1370 icol = NULL; 1371 ncol = C->cmap->N; 1372 } else { 1373 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1374 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1375 } 1376 1377 if (scall == MAT_INITIAL_MATRIX) { 1378 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1379 1380 /* Get some new tags to keep the communication clean */ 1381 tag1 = ((PetscObject)C)->tag; 1382 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1383 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1384 1385 /* evaluate communication - mesg to who, length of mesg, and buffer space 1386 required. Based on this, buffers are allocated, and data copied into them */ 1387 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1388 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1389 1390 /* w1[proc] = num of rows owned by proc -- to be requested */ 1391 proc = 0; 1392 nrqs = 0; /* num of outgoing messages */ 1393 for (j=0; j<nrow; j++) { 1394 row = irow[j]; /* sorted! 
*/ 1395 while (row >= C->rmap->range[proc+1]) proc++; 1396 w1[proc]++; 1397 row2proc[j] = proc; /* map row index to proc */ 1398 1399 if (proc != rank && !w2[proc]) { 1400 w2[proc] = 1; nrqs++; 1401 } 1402 } 1403 w1[rank] = 0; /* rows owned by self will not be requested */ 1404 1405 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1406 for (proc=0,j=0; proc<size; proc++) { 1407 if (w1[proc]) { pa[j++] = proc;} 1408 } 1409 1410 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1411 msz = 0; /* total mesg length (for all procs) */ 1412 for (i=0; i<nrqs; i++) { 1413 proc = pa[i]; 1414 w1[proc] += 3; 1415 msz += w1[proc]; 1416 } 1417 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1418 1419 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1420 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1421 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1422 1423 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1424 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1425 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1426 1427 /* Now post the Irecvs corresponding to these messages */ 1428 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1429 1430 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1431 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1432 1433 /* Allocate Memory for outgoing messages */ 1434 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1435 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1436 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1437 1438 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1439 iptr = tmp; 1440 for (i=0; i<nrqs; i++) { 1441 proc = pa[i]; 1442 sbuf1[proc] = iptr; 
1443 iptr += w1[proc]; 1444 } 1445 1446 /* Form the outgoing messages */ 1447 /* Initialize the header space */ 1448 for (i=0; i<nrqs; i++) { 1449 proc = pa[i]; 1450 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1451 ptr[proc] = sbuf1[proc] + 3; 1452 } 1453 1454 /* Parse the isrow and copy data into outbuf */ 1455 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1456 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1457 proc = row2proc[j]; 1458 if (proc != rank) { /* copy to the outgoing buf*/ 1459 *ptr[proc] = irow[j]; 1460 ctr[proc]++; ptr[proc]++; 1461 } 1462 } 1463 1464 /* Update the headers for the current IS */ 1465 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1466 if ((ctr_j = ctr[j])) { 1467 sbuf1_j = sbuf1[j]; 1468 k = ++sbuf1_j[0]; 1469 sbuf1_j[2*k] = ctr_j; 1470 sbuf1_j[2*k-1] = 0; 1471 } 1472 } 1473 1474 /* Now post the sends */ 1475 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1476 for (i=0; i<nrqs; ++i) { 1477 proc = pa[i]; 1478 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1479 } 1480 1481 /* Post Receives to capture the buffer size */ 1482 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1483 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1484 1485 rbuf2[0] = tmp + msz; 1486 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1487 1488 for (i=0; i<nrqs; ++i) { 1489 proc = pa[i]; 1490 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1491 } 1492 1493 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1494 1495 /* Send to other procs the buf size they should allocate */ 1496 /* Receive messages*/ 1497 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1498 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1499 1500 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1501 for 
(i=0; i<nrqr; ++i) { 1502 req_size[i] = 0; 1503 rbuf1_i = rbuf1[i]; 1504 start = 2*rbuf1_i[0] + 1; 1505 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1506 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1507 sbuf2_i = sbuf2[i]; 1508 for (j=start; j<end; j++) { 1509 k = rbuf1_i[j] - rstart; 1510 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1511 sbuf2_i[j] = ncols; 1512 req_size[i] += ncols; 1513 } 1514 req_source1[i] = r_status1[i].MPI_SOURCE; 1515 1516 /* form the header */ 1517 sbuf2_i[0] = req_size[i]; 1518 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1519 1520 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1521 } 1522 1523 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1524 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1525 1526 /* rbuf2 is received, Post recv column indices a->j */ 1527 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1528 1529 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1530 for (i=0; i<nrqs; ++i) { 1531 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1532 req_source2[i] = r_status2[i].MPI_SOURCE; 1533 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1534 } 1535 1536 /* Wait on sends1 and sends2 */ 1537 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1538 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1539 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1540 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1541 1542 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1543 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1544 1545 /* Now allocate sending buffers for a->j, and send them off */ 1546 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1547 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1548 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1549 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + 
req_size[i-1]; 1550 1551 for (i=0; i<nrqr; i++) { /* for each requested message */ 1552 rbuf1_i = rbuf1[i]; 1553 sbuf_aj_i = sbuf_aj[i]; 1554 ct1 = 2*rbuf1_i[0] + 1; 1555 ct2 = 0; 1556 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1557 1558 kmax = rbuf1[i][2]; 1559 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1560 row = rbuf1_i[ct1] - rstart; 1561 nzA = ai[row+1] - ai[row]; 1562 nzB = bi[row+1] - bi[row]; 1563 ncols = nzA + nzB; 1564 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1565 1566 /* load the column indices for this row into cols*/ 1567 cols = sbuf_aj_i + ct2; 1568 1569 lwrite = 0; 1570 for (l=0; l<nzB; l++) { 1571 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1572 } 1573 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1574 for (l=0; l<nzB; l++) { 1575 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1576 } 1577 1578 ct2 += ncols; 1579 } 1580 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1581 } 1582 1583 /* create column map (cmap): global col of C -> local col of submat */ 1584 #if defined(PETSC_USE_CTABLE) 1585 if (!allcolumns) { 1586 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1587 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1588 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1589 if (icol[j] >= cstart && icol[j] <cend) { 1590 cmap_loc[icol[j] - cstart] = j+1; 1591 } else { /* use PetscTable for non-local col indices */ 1592 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1593 } 1594 } 1595 } else { 1596 cmap = NULL; 1597 cmap_loc = NULL; 1598 } 1599 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1600 #else 1601 if (!allcolumns) { 1602 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1603 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1604 } else { 1605 cmap = NULL; 1606 } 1607 #endif 1608 1609 /* Create lens for 
MatSeqAIJSetPreallocation() */ 1610 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1611 1612 /* Compute lens from local part of C */ 1613 for (j=0; j<nrow; j++) { 1614 row = irow[j]; 1615 proc = row2proc[j]; 1616 if (proc == rank) { 1617 /* diagonal part A = c->A */ 1618 ncols = ai[row-rstart+1] - ai[row-rstart]; 1619 cols = aj + ai[row-rstart]; 1620 if (!allcolumns) { 1621 for (k=0; k<ncols; k++) { 1622 #if defined(PETSC_USE_CTABLE) 1623 tcol = cmap_loc[cols[k]]; 1624 #else 1625 tcol = cmap[cols[k]+cstart]; 1626 #endif 1627 if (tcol) lens[j]++; 1628 } 1629 } else { /* allcolumns */ 1630 lens[j] = ncols; 1631 } 1632 1633 /* off-diagonal part B = c->B */ 1634 ncols = bi[row-rstart+1] - bi[row-rstart]; 1635 cols = bj + bi[row-rstart]; 1636 if (!allcolumns) { 1637 for (k=0; k<ncols; k++) { 1638 #if defined(PETSC_USE_CTABLE) 1639 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1640 #else 1641 tcol = cmap[bmap[cols[k]]]; 1642 #endif 1643 if (tcol) lens[j]++; 1644 } 1645 } else { /* allcolumns */ 1646 lens[j] += ncols; 1647 } 1648 } 1649 } 1650 1651 /* Create row map (rmap): global row of C -> local row of submat */ 1652 #if defined(PETSC_USE_CTABLE) 1653 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1654 for (j=0; j<nrow; j++) { 1655 row = irow[j]; 1656 proc = row2proc[j]; 1657 if (proc == rank) { /* a local row */ 1658 rmap_loc[row - rstart] = j; 1659 } else { 1660 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1661 } 1662 } 1663 #else 1664 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1665 for (j=0; j<nrow; j++) { 1666 rmap[irow[j]] = j; 1667 } 1668 #endif 1669 1670 /* Update lens from offproc data */ 1671 /* recv a->j is done */ 1672 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1673 for (i=0; i<nrqs; i++) { 1674 proc = pa[i]; 1675 sbuf1_i = sbuf1[proc]; 1676 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1677 ct1 = 2 + 1; 1678 ct2 = 0; 1679 
rbuf2_i = rbuf2[i]; /* received length of C->j */ 1680 rbuf3_i = rbuf3[i]; /* received C->j */ 1681 1682 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1683 max1 = sbuf1_i[2]; 1684 for (k=0; k<max1; k++,ct1++) { 1685 #if defined(PETSC_USE_CTABLE) 1686 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1687 row--; 1688 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1689 #else 1690 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1691 #endif 1692 /* Now, store row index of submat in sbuf1_i[ct1] */ 1693 sbuf1_i[ct1] = row; 1694 1695 nnz = rbuf2_i[ct1]; 1696 if (!allcolumns) { 1697 for (l=0; l<nnz; l++,ct2++) { 1698 #if defined(PETSC_USE_CTABLE) 1699 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1700 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1701 } else { 1702 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1703 } 1704 #else 1705 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1706 #endif 1707 if (tcol) lens[row]++; 1708 } 1709 } else { /* allcolumns */ 1710 lens[row] += nnz; 1711 } 1712 } 1713 } 1714 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1715 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1716 1717 /* Create the submatrices */ 1718 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1719 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1720 1721 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1722 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1723 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1724 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1725 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1726 1727 /* create struct Mat_SubMat and attached it to submat */ 1728 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1729 subc = (Mat_SeqAIJ*)submat->data; 1730 subc->submatis1 = smatis1; 1731 1732 smatis1->id = 0; 1733 
smatis1->nrqs = nrqs; 1734 smatis1->nrqr = nrqr; 1735 smatis1->rbuf1 = rbuf1; 1736 smatis1->rbuf2 = rbuf2; 1737 smatis1->rbuf3 = rbuf3; 1738 smatis1->sbuf2 = sbuf2; 1739 smatis1->req_source2 = req_source2; 1740 1741 smatis1->sbuf1 = sbuf1; 1742 smatis1->ptr = ptr; 1743 smatis1->tmp = tmp; 1744 smatis1->ctr = ctr; 1745 1746 smatis1->pa = pa; 1747 smatis1->req_size = req_size; 1748 smatis1->req_source1 = req_source1; 1749 1750 smatis1->allcolumns = allcolumns; 1751 smatis1->singleis = PETSC_TRUE; 1752 smatis1->row2proc = row2proc; 1753 smatis1->rmap = rmap; 1754 smatis1->cmap = cmap; 1755 #if defined(PETSC_USE_CTABLE) 1756 smatis1->rmap_loc = rmap_loc; 1757 smatis1->cmap_loc = cmap_loc; 1758 #endif 1759 1760 smatis1->destroy = submat->ops->destroy; 1761 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1762 submat->factortype = C->factortype; 1763 1764 /* compute rmax */ 1765 rmax = 0; 1766 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1767 1768 } else { /* scall == MAT_REUSE_MATRIX */ 1769 submat = submats[0]; 1770 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1771 1772 subc = (Mat_SeqAIJ*)submat->data; 1773 rmax = subc->rmax; 1774 smatis1 = subc->submatis1; 1775 nrqs = smatis1->nrqs; 1776 nrqr = smatis1->nrqr; 1777 rbuf1 = smatis1->rbuf1; 1778 rbuf2 = smatis1->rbuf2; 1779 rbuf3 = smatis1->rbuf3; 1780 req_source2 = smatis1->req_source2; 1781 1782 sbuf1 = smatis1->sbuf1; 1783 sbuf2 = smatis1->sbuf2; 1784 ptr = smatis1->ptr; 1785 tmp = smatis1->tmp; 1786 ctr = smatis1->ctr; 1787 1788 pa = smatis1->pa; 1789 req_size = smatis1->req_size; 1790 req_source1 = smatis1->req_source1; 1791 1792 allcolumns = smatis1->allcolumns; 1793 row2proc = smatis1->row2proc; 1794 rmap = smatis1->rmap; 1795 cmap = smatis1->cmap; 1796 #if defined(PETSC_USE_CTABLE) 1797 rmap_loc = smatis1->rmap_loc; 1798 cmap_loc = smatis1->cmap_loc; 1799 #endif 1800 } 1801 1802 /* Post recv matrix values */ 1803 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1804 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1805 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1806 for (i=0; i<nrqs; ++i) { 1807 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1808 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1809 } 1810 1811 /* Allocate sending buffers for a->a, and send them off */ 1812 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1813 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1814 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1815 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1816 1817 for (i=0; i<nrqr; i++) { 1818 rbuf1_i = rbuf1[i]; 1819 sbuf_aa_i = sbuf_aa[i]; 1820 ct1 = 2*rbuf1_i[0]+1; 1821 ct2 = 0; 1822 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1823 1824 kmax = rbuf1_i[2]; 1825 for (k=0; k<kmax; k++,ct1++) { 1826 row = rbuf1_i[ct1] - rstart; 1827 nzA = ai[row+1] - ai[row]; 1828 nzB = bi[row+1] - 
bi[row]; 1829 ncols = nzA + nzB; 1830 cworkB = bj + bi[row]; 1831 vworkA = a_a + ai[row]; 1832 vworkB = b_a + bi[row]; 1833 1834 /* load the column values for this row into vals*/ 1835 vals = sbuf_aa_i + ct2; 1836 1837 lwrite = 0; 1838 for (l=0; l<nzB; l++) { 1839 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1840 } 1841 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1842 for (l=0; l<nzB; l++) { 1843 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1844 } 1845 1846 ct2 += ncols; 1847 } 1848 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1849 } 1850 1851 /* Assemble submat */ 1852 /* First assemble the local rows */ 1853 for (j=0; j<nrow; j++) { 1854 row = irow[j]; 1855 proc = row2proc[j]; 1856 if (proc == rank) { 1857 Crow = row - rstart; /* local row index of C */ 1858 #if defined(PETSC_USE_CTABLE) 1859 row = rmap_loc[Crow]; /* row index of submat */ 1860 #else 1861 row = rmap[row]; 1862 #endif 1863 1864 if (allcolumns) { 1865 /* diagonal part A = c->A */ 1866 ncols = ai[Crow+1] - ai[Crow]; 1867 cols = aj + ai[Crow]; 1868 vals = a->a + ai[Crow]; 1869 i = 0; 1870 for (k=0; k<ncols; k++) { 1871 subcols[i] = cols[k] + cstart; 1872 subvals[i++] = vals[k]; 1873 } 1874 1875 /* off-diagonal part B = c->B */ 1876 ncols = bi[Crow+1] - bi[Crow]; 1877 cols = bj + bi[Crow]; 1878 vals = b->a + bi[Crow]; 1879 for (k=0; k<ncols; k++) { 1880 subcols[i] = bmap[cols[k]]; 1881 subvals[i++] = vals[k]; 1882 } 1883 1884 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1885 1886 } else { /* !allcolumns */ 1887 #if defined(PETSC_USE_CTABLE) 1888 /* diagonal part A = c->A */ 1889 ncols = ai[Crow+1] - ai[Crow]; 1890 cols = aj + ai[Crow]; 1891 vals = a->a + ai[Crow]; 1892 i = 0; 1893 for (k=0; k<ncols; k++) { 1894 tcol = cmap_loc[cols[k]]; 1895 if (tcol) { 1896 subcols[i] = --tcol; 1897 subvals[i++] = vals[k]; 1898 } 1899 } 1900 1901 /* off-diagonal part B = c->B */ 
1902 ncols = bi[Crow+1] - bi[Crow]; 1903 cols = bj + bi[Crow]; 1904 vals = b->a + bi[Crow]; 1905 for (k=0; k<ncols; k++) { 1906 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1907 if (tcol) { 1908 subcols[i] = --tcol; 1909 subvals[i++] = vals[k]; 1910 } 1911 } 1912 #else 1913 /* diagonal part A = c->A */ 1914 ncols = ai[Crow+1] - ai[Crow]; 1915 cols = aj + ai[Crow]; 1916 vals = a->a + ai[Crow]; 1917 i = 0; 1918 for (k=0; k<ncols; k++) { 1919 tcol = cmap[cols[k]+cstart]; 1920 if (tcol) { 1921 subcols[i] = --tcol; 1922 subvals[i++] = vals[k]; 1923 } 1924 } 1925 1926 /* off-diagonal part B = c->B */ 1927 ncols = bi[Crow+1] - bi[Crow]; 1928 cols = bj + bi[Crow]; 1929 vals = b->a + bi[Crow]; 1930 for (k=0; k<ncols; k++) { 1931 tcol = cmap[bmap[cols[k]]]; 1932 if (tcol) { 1933 subcols[i] = --tcol; 1934 subvals[i++] = vals[k]; 1935 } 1936 } 1937 #endif 1938 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1939 } 1940 } 1941 } 1942 1943 /* Now assemble the off-proc rows */ 1944 for (i=0; i<nrqs; i++) { /* for each requested message */ 1945 /* recv values from other processes */ 1946 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1947 proc = pa[idex]; 1948 sbuf1_i = sbuf1[proc]; 1949 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1950 ct1 = 2 + 1; 1951 ct2 = 0; /* count of received C->j */ 1952 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1953 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1954 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1955 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1956 1957 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1958 max1 = sbuf1_i[2]; /* num of rows */ 1959 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1960 row = sbuf1_i[ct1]; /* row index of submat */ 
1961 if (!allcolumns) { 1962 idex = 0; 1963 if (scall == MAT_INITIAL_MATRIX) { 1964 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1965 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1966 #if defined(PETSC_USE_CTABLE) 1967 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1968 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1969 } else { 1970 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1971 } 1972 #else 1973 tcol = cmap[rbuf3_i[ct2]]; 1974 #endif 1975 if (tcol) { 1976 subcols[idex] = --tcol; 1977 subvals[idex++] = rbuf4_i[ct2]; 1978 1979 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1980 For reuse, we replace received C->j with index that should be inserted to submat */ 1981 rbuf3_i[ct3++] = ct2; 1982 } 1983 } 1984 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1985 1986 } else { /* scall == MAT_REUSE_MATRIX */ 1987 submat = submats[0]; 1988 subc = (Mat_SeqAIJ*)submat->data; 1989 1990 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1991 for (l=0; l<nnz; l++) { 1992 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1993 subvals[idex++] = rbuf4_i[ct2]; 1994 } 1995 1996 bj = subc->j + subc->i[row]; 1997 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1998 } 1999 } else { /* allcolumns */ 2000 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 2001 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 2002 ct2 += nnz; 2003 } 2004 } 2005 } 2006 2007 /* sending a->a are done */ 2008 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 2009 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 2010 2011 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2012 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2013 submats[0] = submat; 2014 2015 /* 
Restore the indices */ 2016 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 2017 if (!allcolumns) { 2018 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 2019 } 2020 2021 /* Destroy allocated memory */ 2022 for (i=0; i<nrqs; ++i) { 2023 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 2024 } 2025 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 2026 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2027 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2028 2029 if (scall == MAT_INITIAL_MATRIX) { 2030 ierr = PetscFree(lens);CHKERRQ(ierr); 2031 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2032 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2033 } 2034 PetscFunctionReturn(0); 2035 } 2036 2037 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2038 { 2039 PetscErrorCode ierr; 2040 PetscInt ncol; 2041 PetscBool colflag,allcolumns=PETSC_FALSE; 2042 2043 PetscFunctionBegin; 2044 /* Allocate memory to hold all the submatrices */ 2045 if (scall == MAT_INITIAL_MATRIX) { 2046 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 2047 } 2048 2049 /* Check for special case: each processor gets entire matrix columns */ 2050 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2051 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2052 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2053 2054 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2055 PetscFunctionReturn(0); 2056 } 2057 2058 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2059 { 2060 PetscErrorCode ierr; 2061 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2]; 2062 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 2063 Mat_SeqAIJ *subc; 2064 Mat_SubMat *smat; 2065 2066 PetscFunctionBegin; 2067 /* Check for special case: each processor has a single IS */ 2068 if 
/*
   MatGetSubMatrices_MPIAIJ - Collective driver that extracts ismax sequential submatrices
   from the parallel MPIAIJ matrix C.

   Responsibilities visible here:
   1. Dispatch to the single-IS fast path when C->submat_singleis is set (or, on reuse,
      when the saved submatrix state was built by the single-IS path).
   2. Detect the "want entire matrix" special case (identity row and column IS covering
      all of C) and hand it to MatGetSubMatrix_MPIAIJ_All().
   3. Otherwise split the work into nstages stages of at most nmax submatrices each and
      call MatGetSubMatrices_MPIAIJ_Local() once per stage on every process.

   NOTE: this routine performs an MPIU_Allreduce (MAT_INITIAL_MATRIX path) and calls a
   collective per-stage worker, so all ranks must take the same branch sequence; the
   allreduce is exactly what synchronizes wantallmatrix/nstages across ranks.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubMat     *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of submatrices extractable per stage within a ~20M-PetscInt budget,
     scaled by the global column count; at least 1 */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      /* Single square request: test whether it selects the entire matrix */
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        /* User may opt out of the fast whole-matrix path via -use_fast_submatrix */
        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* Agree globally: max of (-wantallmatrix) picks up any rank that does NOT want the
       whole matrix; max of nstages makes every rank loop the same number of stages */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr  = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages       = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      /* State saved on the first submatrix during the MAT_INITIAL_MATRIX call */
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubMat*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      /* Initial extraction used the single-IS path; reuse must follow the same path */
      ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages;
    }
  }

  if (wantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

#if 0
  PetscMPIInt rank;
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)C),&rank);CHKERRQ(ierr);
  printf("[%d] reuse %d, ismax %d, CN %d, nstages %d, nmax %d\n",rank,scall,ismax,C->cmap->N,nstages,nmax);
#endif
  /* Stage loop: every rank runs all nstages iterations (collective worker), extracting
     up to nmax submatrices per stage; ranks that run out of local work pass max_no = 0 */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    /* if (!max_no) printf("[%d] max_no=0, %d-stage\n",rank,i); */
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubMat*)(*submat)[pos]->data;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}
PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2196 size = c->size; 2197 rank = c->rank; 2198 2199 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2200 ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2201 2202 for (i=0; i<ismax; i++) { 2203 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2204 if (!issorted[i]) iscsorted = issorted[i]; 2205 2206 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2207 2208 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2209 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2210 2211 /* Check for special case: allcolumn */ 2212 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2213 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2214 if (colflag && ncol[i] == C->cmap->N) { 2215 allcolumns[i] = PETSC_TRUE; 2216 icol[i] = NULL; 2217 } else { 2218 allcolumns[i] = PETSC_FALSE; 2219 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2220 } 2221 } 2222 2223 if (scall == MAT_REUSE_MATRIX) { 2224 /* Assumes new rows are same length as the old rows */ 2225 for (i=0; i<ismax; i++) { 2226 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2227 subc = (Mat_SeqAIJ*)submats[i]->data; 2228 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 2229 2230 /* Initial matrix as if empty */ 2231 ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2232 2233 smat_i = subc->submatis1; 2234 2235 nrqs = smat_i->nrqs; 2236 nrqr = smat_i->nrqr; 2237 rbuf1 = smat_i->rbuf1; 2238 rbuf2 = smat_i->rbuf2; 2239 rbuf3 = smat_i->rbuf3; 2240 req_source2 = smat_i->req_source2; 2241 2242 sbuf1 = smat_i->sbuf1; 2243 sbuf2 = smat_i->sbuf2; 2244 ptr = smat_i->ptr; 2245 tmp = smat_i->tmp; 2246 ctr = smat_i->ctr; 2247 2248 pa = smat_i->pa; 2249 req_size = smat_i->req_size; 2250 req_source1 = smat_i->req_source1; 2251 2252 allcolumns[i] = smat_i->allcolumns; 2253 row2proc[i] = smat_i->row2proc; 2254 rmap[i] = smat_i->rmap; 2255 cmap[i] = smat_i->cmap; 2256 } 2257 2258 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2259 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2260 smat_i = (Mat_SubMat*)submats[0]->data; 2261 2262 nrqs = smat_i->nrqs; 2263 nrqr = smat_i->nrqr; 2264 rbuf1 = smat_i->rbuf1; 2265 rbuf2 = smat_i->rbuf2; 2266 rbuf3 = smat_i->rbuf3; 2267 req_source2 = smat_i->req_source2; 2268 2269 sbuf1 = smat_i->sbuf1; 2270 sbuf2 = smat_i->sbuf2; 2271 ptr = smat_i->ptr; 2272 tmp = smat_i->tmp; 2273 ctr = smat_i->ctr; 2274 2275 pa = smat_i->pa; 2276 req_size = smat_i->req_size; 2277 req_source1 = smat_i->req_source1; 2278 2279 allcolumns[0] = PETSC_TRUE; 2280 } 2281 } else { /* scall == MAT_INITIAL_MATRIX */ 2282 /* Get some new tags to keep the communication clean */ 2283 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2284 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2285 2286 /* evaluate communication - mesg to who, length of mesg, and buffer space 2287 required. 
Based on this, buffers are allocated, and data copied into them*/ 2288 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2289 2290 for (i=0; i<ismax; i++) { 2291 jmax = nrow[i]; 2292 irow_i = irow[i]; 2293 2294 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2295 row2proc[i] = row2proc_i; 2296 2297 if (issorted[i]) proc = 0; 2298 for (j=0; j<jmax; j++) { 2299 if (!issorted[i]) proc = 0; 2300 row = irow_i[j]; 2301 while (row >= C->rmap->range[proc+1]) proc++; 2302 w4[proc]++; 2303 row2proc_i[j] = proc; /* map row index to proc */ 2304 } 2305 for (j=0; j<size; j++) { 2306 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2307 } 2308 } 2309 2310 nrqs = 0; /* no of outgoing messages */ 2311 msz = 0; /* total mesg length (for all procs) */ 2312 w1[rank] = 0; /* no mesg sent to self */ 2313 w3[rank] = 0; 2314 for (i=0; i<size; i++) { 2315 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2316 } 2317 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2318 for (i=0,j=0; i<size; i++) { 2319 if (w1[i]) { pa[j] = i; j++; } 2320 } 2321 2322 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2323 for (i=0; i<nrqs; i++) { 2324 j = pa[i]; 2325 w1[j] += w2[j] + 2* w3[j]; 2326 msz += w1[j]; 2327 } 2328 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2329 2330 /* Determine the number of messages to expect, their lengths, from from-ids */ 2331 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2332 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2333 2334 /* Now post the Irecvs corresponding to these messages */ 2335 tag0 = ((PetscObject)C)->tag; 2336 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2337 2338 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2339 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2340 2341 /* 
Allocate Memory for outgoing messages */ 2342 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2343 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2344 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2345 2346 { 2347 PetscInt *iptr = tmp; 2348 k = 0; 2349 for (i=0; i<nrqs; i++) { 2350 j = pa[i]; 2351 iptr += k; 2352 sbuf1[j] = iptr; 2353 k = w1[j]; 2354 } 2355 } 2356 2357 /* Form the outgoing messages. Initialize the header space */ 2358 for (i=0; i<nrqs; i++) { 2359 j = pa[i]; 2360 sbuf1[j][0] = 0; 2361 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2362 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2363 } 2364 2365 /* Parse the isrow and copy data into outbuf */ 2366 for (i=0; i<ismax; i++) { 2367 row2proc_i = row2proc[i]; 2368 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2369 irow_i = irow[i]; 2370 jmax = nrow[i]; 2371 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2372 proc = row2proc_i[j]; 2373 if (proc != rank) { /* copy to the outgoing buf*/ 2374 ctr[proc]++; 2375 *ptr[proc] = irow_i[j]; 2376 ptr[proc]++; 2377 } 2378 } 2379 /* Update the headers for the current IS */ 2380 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2381 if ((ctr_j = ctr[j])) { 2382 sbuf1_j = sbuf1[j]; 2383 k = ++sbuf1_j[0]; 2384 sbuf1_j[2*k] = ctr_j; 2385 sbuf1_j[2*k-1] = i; 2386 } 2387 } 2388 } 2389 2390 /* Now post the sends */ 2391 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2392 for (i=0; i<nrqs; ++i) { 2393 j = pa[i]; 2394 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2395 } 2396 2397 /* Post Receives to capture the buffer size */ 2398 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2399 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2400 rbuf2[0] = tmp + msz; 2401 for (i=1; i<nrqs; ++i) { 2402 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2403 } 2404 for (i=0; i<nrqs; ++i) { 2405 j = pa[i]; 2406 
ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 2407 } 2408 2409 /* Send to other procs the buf size they should allocate */ 2410 /* Receive messages*/ 2411 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2412 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2413 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2414 { 2415 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2416 PetscInt *sbuf2_i; 2417 2418 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2419 for (i=0; i<nrqr; ++i) { 2420 req_size[i] = 0; 2421 rbuf1_i = rbuf1[i]; 2422 start = 2*rbuf1_i[0] + 1; 2423 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2424 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2425 sbuf2_i = sbuf2[i]; 2426 for (j=start; j<end; j++) { 2427 id = rbuf1_i[j] - rstart; 2428 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2429 sbuf2_i[j] = ncols; 2430 req_size[i] += ncols; 2431 } 2432 req_source1[i] = r_status1[i].MPI_SOURCE; 2433 /* form the header */ 2434 sbuf2_i[0] = req_size[i]; 2435 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2436 2437 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 2438 } 2439 } 2440 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2441 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2442 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2443 2444 /* Receive messages*/ 2445 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2446 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2447 2448 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2449 for (i=0; i<nrqs; ++i) { 2450 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2451 req_source2[i] = r_status2[i].MPI_SOURCE; 2452 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 2453 } 2454 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2455 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2456 2457 /* Wait on 
sends1 and sends2 */ 2458 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2459 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2460 2461 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2462 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2463 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2464 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2465 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2466 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2467 2468 /* Now allocate sending buffers for a->j, and send them off */ 2469 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2470 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2471 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2472 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2473 2474 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2475 { 2476 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2477 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2478 PetscInt cend = C->cmap->rend; 2479 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2480 2481 for (i=0; i<nrqr; i++) { 2482 rbuf1_i = rbuf1[i]; 2483 sbuf_aj_i = sbuf_aj[i]; 2484 ct1 = 2*rbuf1_i[0] + 1; 2485 ct2 = 0; 2486 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2487 kmax = rbuf1[i][2*j]; 2488 for (k=0; k<kmax; k++,ct1++) { 2489 row = rbuf1_i[ct1] - rstart; 2490 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2491 ncols = nzA + nzB; 2492 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2493 2494 /* load the column indices for this row into cols */ 2495 cols = sbuf_aj_i + ct2; 2496 2497 lwrite = 0; 2498 for (l=0; l<nzB; l++) { 2499 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2500 } 2501 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2502 for (l=0; l<nzB; l++) { 2503 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2504 } 2505 2506 ct2 += ncols; 2507 } 2508 } 2509 ierr = 
MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 2510 } 2511 } 2512 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2513 2514 /* create col map: global col of C -> local col of submatrices */ 2515 { 2516 const PetscInt *icol_i; 2517 #if defined(PETSC_USE_CTABLE) 2518 for (i=0; i<ismax; i++) { 2519 if (!allcolumns[i]) { 2520 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2521 2522 jmax = ncol[i]; 2523 icol_i = icol[i]; 2524 cmap_i = cmap[i]; 2525 for (j=0; j<jmax; j++) { 2526 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2527 } 2528 } else cmap[i] = NULL; 2529 } 2530 #else 2531 for (i=0; i<ismax; i++) { 2532 if (!allcolumns[i]) { 2533 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2534 jmax = ncol[i]; 2535 icol_i = icol[i]; 2536 cmap_i = cmap[i]; 2537 for (j=0; j<jmax; j++) { 2538 cmap_i[icol_i[j]] = j+1; 2539 } 2540 } else cmap[i] = NULL; 2541 } 2542 #endif 2543 } 2544 2545 /* Create lens which is required for MatCreate... 
*/ 2546 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2547 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2548 2549 if (ismax) { 2550 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2551 } 2552 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2553 2554 /* Update lens from local data */ 2555 for (i=0; i<ismax; i++) { 2556 row2proc_i = row2proc[i]; 2557 jmax = nrow[i]; 2558 if (!allcolumns[i]) cmap_i = cmap[i]; 2559 irow_i = irow[i]; 2560 lens_i = lens[i]; 2561 for (j=0; j<jmax; j++) { 2562 row = irow_i[j]; 2563 proc = row2proc_i[j]; 2564 if (proc == rank) { 2565 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2566 if (!allcolumns[i]) { 2567 for (k=0; k<ncols; k++) { 2568 #if defined(PETSC_USE_CTABLE) 2569 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2570 #else 2571 tcol = cmap_i[cols[k]]; 2572 #endif 2573 if (tcol) lens_i[j]++; 2574 } 2575 } else { /* allcolumns */ 2576 lens_i[j] = ncols; 2577 } 2578 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2579 } 2580 } 2581 } 2582 2583 /* Create row map: global row of C -> local row of submatrices */ 2584 #if defined(PETSC_USE_CTABLE) 2585 for (i=0; i<ismax; i++) { 2586 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2587 irow_i = irow[i]; 2588 jmax = nrow[i]; 2589 for (j=0; j<jmax; j++) { 2590 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2591 } 2592 } 2593 #else 2594 for (i=0; i<ismax; i++) { 2595 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2596 rmap_i = rmap[i]; 2597 irow_i = irow[i]; 2598 jmax = nrow[i]; 2599 for (j=0; j<jmax; j++) { 2600 rmap_i[irow_i[j]] = j; 2601 } 2602 } 2603 #endif 2604 2605 /* Update lens from offproc data */ 2606 { 2607 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2608 2609 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2610 for (tmp2=0; tmp2<nrqs; tmp2++) { 2611 sbuf1_i = sbuf1[pa[tmp2]]; 2612 jmax = sbuf1_i[0]; 2613 ct1 = 2*jmax+1; 2614 ct2 = 0; 2615 rbuf2_i = rbuf2[tmp2]; 2616 
rbuf3_i = rbuf3[tmp2]; 2617 for (j=1; j<=jmax; j++) { 2618 is_no = sbuf1_i[2*j-1]; 2619 max1 = sbuf1_i[2*j]; 2620 lens_i = lens[is_no]; 2621 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2622 rmap_i = rmap[is_no]; 2623 for (k=0; k<max1; k++,ct1++) { 2624 #if defined(PETSC_USE_CTABLE) 2625 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2626 row--; 2627 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2628 #else 2629 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2630 #endif 2631 max2 = rbuf2_i[ct1]; 2632 for (l=0; l<max2; l++,ct2++) { 2633 if (!allcolumns[is_no]) { 2634 #if defined(PETSC_USE_CTABLE) 2635 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2636 #else 2637 tcol = cmap_i[rbuf3_i[ct2]]; 2638 #endif 2639 if (tcol) lens_i[row]++; 2640 } else { /* allcolumns */ 2641 lens_i[row]++; /* lens_i[row] += max2 ? */ 2642 } 2643 } 2644 } 2645 } 2646 } 2647 } 2648 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2649 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2650 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2651 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2652 2653 /* Create the submatrices */ 2654 for (i=0; i<ismax; i++) { 2655 PetscInt rbs,cbs; 2656 2657 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2658 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2659 2660 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2661 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2662 2663 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2664 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2665 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2666 2667 /* create struct Mat_SubMat and attached it to submat */ 2668 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2669 subc = (Mat_SeqAIJ*)submats[i]->data; 2670 subc->submatis1 = smat_i; 2671 2672 smat_i->destroy = 
submats[i]->ops->destroy; 2673 submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 2674 submats[i]->factortype = C->factortype; 2675 2676 smat_i->id = i; 2677 smat_i->nrqs = nrqs; 2678 smat_i->nrqr = nrqr; 2679 smat_i->rbuf1 = rbuf1; 2680 smat_i->rbuf2 = rbuf2; 2681 smat_i->rbuf3 = rbuf3; 2682 smat_i->sbuf2 = sbuf2; 2683 smat_i->req_source2 = req_source2; 2684 2685 smat_i->sbuf1 = sbuf1; 2686 smat_i->ptr = ptr; 2687 smat_i->tmp = tmp; 2688 smat_i->ctr = ctr; 2689 2690 smat_i->pa = pa; 2691 smat_i->req_size = req_size; 2692 smat_i->req_source1 = req_source1; 2693 2694 smat_i->allcolumns = allcolumns[i]; 2695 smat_i->singleis = PETSC_FALSE; 2696 smat_i->row2proc = row2proc[i]; 2697 smat_i->rmap = rmap[i]; 2698 smat_i->cmap = cmap[i]; 2699 } 2700 2701 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2702 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2703 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2704 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2705 2706 /* create struct Mat_SubMat and attached it to submat */ 2707 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2708 submats[0]->data = (void*)smat_i; 2709 2710 smat_i->destroy = submats[0]->ops->destroy; 2711 submats[0]->ops->destroy = MatDestroy_Dummy_MatGetSubmatrices; 2712 submats[0]->factortype = C->factortype; 2713 2714 smat_i->id = i; 2715 smat_i->nrqs = nrqs; 2716 smat_i->nrqr = nrqr; 2717 smat_i->rbuf1 = rbuf1; 2718 smat_i->rbuf2 = rbuf2; 2719 smat_i->rbuf3 = rbuf3; 2720 smat_i->sbuf2 = sbuf2; 2721 smat_i->req_source2 = req_source2; 2722 2723 smat_i->sbuf1 = sbuf1; 2724 smat_i->ptr = ptr; 2725 smat_i->tmp = tmp; 2726 smat_i->ctr = ctr; 2727 2728 smat_i->pa = pa; 2729 smat_i->req_size = req_size; 2730 smat_i->req_source1 = req_source1; 2731 2732 smat_i->allcolumns = PETSC_TRUE; 2733 smat_i->singleis = PETSC_FALSE; 2734 smat_i->row2proc = NULL; 2735 smat_i->rmap = NULL; 2736 smat_i->cmap = NULL; 2737 } 2738 2739 if 
(ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2740 ierr = PetscFree(lens);CHKERRQ(ierr); 2741 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2742 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2743 2744 } /* endof scall == MAT_INITIAL_MATRIX */ 2745 2746 /* Post recv matrix values */ 2747 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2748 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2749 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2750 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2751 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2752 for (i=0; i<nrqs; ++i) { 2753 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2754 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 2755 } 2756 2757 /* Allocate sending buffers for a->a, and send them off */ 2758 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2759 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2760 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2761 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2762 2763 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2764 { 2765 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2766 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2767 PetscInt cend = C->cmap->rend; 2768 PetscInt *b_j = b->j; 2769 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2770 2771 for (i=0; i<nrqr; i++) { 2772 rbuf1_i = rbuf1[i]; 2773 sbuf_aa_i = sbuf_aa[i]; 2774 ct1 = 2*rbuf1_i[0]+1; 2775 ct2 = 0; 2776 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2777 kmax = rbuf1_i[2*j]; 2778 for (k=0; k<kmax; k++,ct1++) { 2779 row = rbuf1_i[ct1] - rstart; 2780 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2781 ncols = nzA + nzB; 2782 cworkB = b_j + b_i[row]; 2783 vworkA = a_a + a_i[row]; 2784 vworkB = b_a + b_i[row]; 2785 2786 /* load the column values for this row into vals*/ 2787 vals = sbuf_aa_i+ct2; 2788 2789 lwrite = 0; 2790 
for (l=0; l<nzB; l++) { 2791 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2792 } 2793 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2794 for (l=0; l<nzB; l++) { 2795 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2796 } 2797 2798 ct2 += ncols; 2799 } 2800 } 2801 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 2802 } 2803 } 2804 2805 /* Assemble the matrices */ 2806 /* First assemble the local rows */ 2807 for (i=0; i<ismax; i++) { 2808 row2proc_i = row2proc[i]; 2809 subc = (Mat_SeqAIJ*)submats[i]->data; 2810 imat_ilen = subc->ilen; 2811 imat_j = subc->j; 2812 imat_i = subc->i; 2813 imat_a = subc->a; 2814 2815 if (!allcolumns[i]) cmap_i = cmap[i]; 2816 rmap_i = rmap[i]; 2817 irow_i = irow[i]; 2818 jmax = nrow[i]; 2819 for (j=0; j<jmax; j++) { 2820 row = irow_i[j]; 2821 proc = row2proc_i[j]; 2822 if (proc == rank) { 2823 old_row = row; 2824 #if defined(PETSC_USE_CTABLE) 2825 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2826 row--; 2827 #else 2828 row = rmap_i[row]; 2829 #endif 2830 ilen_row = imat_ilen[row]; 2831 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2832 mat_i = imat_i[row]; 2833 mat_a = imat_a + mat_i; 2834 mat_j = imat_j + mat_i; 2835 if (!allcolumns[i]) { 2836 for (k=0; k<ncols; k++) { 2837 #if defined(PETSC_USE_CTABLE) 2838 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2839 #else 2840 tcol = cmap_i[cols[k]]; 2841 #endif 2842 if (tcol) { 2843 *mat_j++ = tcol - 1; 2844 *mat_a++ = vals[k]; 2845 ilen_row++; 2846 } 2847 } 2848 } else { /* allcolumns */ 2849 for (k=0; k<ncols; k++) { 2850 *mat_j++ = cols[k]; /* global col index! 
*/ 2851 *mat_a++ = vals[k]; 2852 ilen_row++; 2853 } 2854 } 2855 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2856 2857 imat_ilen[row] = ilen_row; 2858 } 2859 } 2860 } 2861 2862 /* Now assemble the off proc rows */ 2863 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2864 for (tmp2=0; tmp2<nrqs; tmp2++) { 2865 sbuf1_i = sbuf1[pa[tmp2]]; 2866 jmax = sbuf1_i[0]; 2867 ct1 = 2*jmax + 1; 2868 ct2 = 0; 2869 rbuf2_i = rbuf2[tmp2]; 2870 rbuf3_i = rbuf3[tmp2]; 2871 rbuf4_i = rbuf4[tmp2]; 2872 for (j=1; j<=jmax; j++) { 2873 is_no = sbuf1_i[2*j-1]; 2874 rmap_i = rmap[is_no]; 2875 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2876 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2877 imat_ilen = subc->ilen; 2878 imat_j = subc->j; 2879 imat_i = subc->i; 2880 imat_a = subc->a; 2881 max1 = sbuf1_i[2*j]; 2882 for (k=0; k<max1; k++,ct1++) { 2883 row = sbuf1_i[ct1]; 2884 #if defined(PETSC_USE_CTABLE) 2885 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2886 row--; 2887 #else 2888 row = rmap_i[row]; 2889 #endif 2890 ilen = imat_ilen[row]; 2891 mat_i = imat_i[row]; 2892 mat_a = imat_a + mat_i; 2893 mat_j = imat_j + mat_i; 2894 max2 = rbuf2_i[ct1]; 2895 if (!allcolumns[is_no]) { 2896 for (l=0; l<max2; l++,ct2++) { 2897 #if defined(PETSC_USE_CTABLE) 2898 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2899 #else 2900 tcol = cmap_i[rbuf3_i[ct2]]; 2901 #endif 2902 if (tcol) { 2903 *mat_j++ = tcol - 1; 2904 *mat_a++ = rbuf4_i[ct2]; 2905 ilen++; 2906 } 2907 } 2908 } else { /* allcolumns */ 2909 for (l=0; l<max2; l++,ct2++) { 2910 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2911 *mat_a++ = rbuf4_i[ct2]; 2912 ilen++; 2913 } 2914 } 2915 imat_ilen[row] = ilen; 2916 } 2917 } 2918 } 2919 2920 if (!iscsorted) { /* sort column indices of the rows */ 2921 for (i=0; i<ismax; i++) { 2922 subc = (Mat_SeqAIJ*)submats[i]->data; 2923 imat_j = subc->j; 2924 imat_i = subc->i; 2925 imat_a = subc->a; 2926 imat_ilen = subc->ilen; 2927 
2928 if (allcolumns[i]) continue; 2929 jmax = nrow[i]; 2930 for (j=0; j<jmax; j++) { 2931 mat_i = imat_i[j]; 2932 mat_a = imat_a + mat_i; 2933 mat_j = imat_j + mat_i; 2934 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2935 } 2936 } 2937 } 2938 2939 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2940 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2941 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2942 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2943 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2944 2945 /* Restore the indices */ 2946 for (i=0; i<ismax; i++) { 2947 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2948 if (!allcolumns[i]) { 2949 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2950 } 2951 } 2952 2953 for (i=0; i<ismax; i++) { 2954 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2955 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2956 } 2957 2958 /* Destroy allocated memory */ 2959 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2960 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2961 ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr); 2962 2963 for (i=0; i<nrqs; ++i) { 2964 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2965 } 2966 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2967 2968 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2969 PetscFunctionReturn(0); 2970 } 2971 2972 /* 2973 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2974 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2975 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2976 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2977 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2978 state, and needs to be "assembled" later by compressing B's column space. 
2979 2980 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2981 Following this call, C->A & C->B have been created, even if empty. 2982 */ 2983 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2984 { 2985 /* If making this function public, change the error returned in this function away from _PLIB. */ 2986 PetscErrorCode ierr; 2987 Mat_MPIAIJ *aij; 2988 Mat_SeqAIJ *Baij; 2989 PetscBool seqaij,Bdisassembled; 2990 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2991 PetscScalar v; 2992 const PetscInt *rowindices,*colindices; 2993 2994 PetscFunctionBegin; 2995 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2996 if (A) { 2997 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2998 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2999 if (rowemb) { 3000 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 3001 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 3002 } else { 3003 if (C->rmap->n != A->rmap->n) { 3004 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 3005 } 3006 } 3007 if (dcolemb) { 3008 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 3009 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 3010 } else { 3011 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 3012 } 3013 } 3014 if (B) { 3015 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 3016 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 3017 if (rowemb) { 3018 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 3019 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 3020 } else { 3021 if (C->rmap->n != B->rmap->n) { 3022 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 3023 } 3024 } 3025 if (ocolemb) { 3026 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 3027 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 3028 } else { 3029 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 3030 } 3031 } 3032 3033 aij = (Mat_MPIAIJ*)C->data; 3034 if (!aij->A) { 3035 /* Mimic parts of MatMPIAIJSetPreallocation() */ 3036 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 3037 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 3038 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 3039 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 3040 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 3041 } 3042 if (A) { 3043 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 3044 } else { 3045 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 3046 } 3047 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 3048 /* 3049 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 3050 need to "disassemble" B -- convert it to using C's global indices. 3051 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 3052 3053 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 3054 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
3055 3056 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 3057 At least avoid calling MatSetValues() and the implied searches? 3058 */ 3059 3060 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 3061 #if defined(PETSC_USE_CTABLE) 3062 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 3063 #else 3064 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 3065 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 3066 if (aij->B) { 3067 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 3068 } 3069 #endif 3070 ngcol = 0; 3071 if (aij->lvec) { 3072 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 3073 } 3074 if (aij->garray) { 3075 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 3076 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 3077 } 3078 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 3079 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 3080 } 3081 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 3082 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 3083 } 3084 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3085 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3086 } 3087 } 3088 Bdisassembled = PETSC_FALSE; 3089 if (!aij->B) { 3090 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3091 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3092 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3093 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3094 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3095 Bdisassembled = PETSC_TRUE; 3096 } 3097 if (B) { 3098 Baij = (Mat_SeqAIJ*)B->data; 3099 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3100 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3101 for (i=0; i<B->rmap->n; i++) { 3102 nz[i] = Baij->i[i+1] - Baij->i[i]; 3103 } 3104 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3105 ierr = PetscFree(nz);CHKERRQ(ierr); 3106 } 3107 3108 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3109 shift = rend-rstart; 3110 count = 0; 3111 rowindices = NULL; 3112 colindices = NULL; 3113 if (rowemb) { 3114 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3115 } 3116 if (ocolemb) { 3117 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3118 } 3119 for (i=0; i<B->rmap->n; i++) { 3120 PetscInt row; 3121 row = i; 3122 if (rowindices) row = rowindices[i]; 3123 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3124 col = Baij->j[count]; 3125 if (colindices) col = colindices[col]; 3126 if (Bdisassembled && col>=rstart) col += shift; 3127 v = Baij->a[count]; 3128 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3129 ++count; 3130 } 3131 } 3132 /* No assembly for aij->B is necessary. */ 3133 /* FIXME: set aij->B's nonzerostate correctly. */ 3134 } else { 3135 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3136 } 3137 C->preallocated = PETSC_TRUE; 3138 C->was_assembled = PETSC_FALSE; 3139 C->assembled = PETSC_FALSE; 3140 /* 3141 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3142 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3143 */ 3144 PetscFunctionReturn(0); 3145 } 3146 3147 /* 3148 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3149 */ 3150 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3151 { 3152 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3153 3154 PetscFunctionBegin; 3155 PetscValidPointer(A,2); 3156 PetscValidPointer(B,3); 3157 /* FIXME: make sure C is assembled */ 3158 *A = aij->A; 3159 *B = aij->B; 3160 /* Note that we don't incref *A and *B, so be careful! */ 3161 PetscFunctionReturn(0); 3162 } 3163 3164 /* 3165 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
3166 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3167 */ 3168 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3169 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3170 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3171 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3172 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3173 { 3174 PetscErrorCode ierr; 3175 PetscMPIInt isize,flag; 3176 PetscInt i,ii,cismax,ispar; 3177 Mat *A,*B; 3178 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3179 3180 PetscFunctionBegin; 3181 if (!ismax) PetscFunctionReturn(0); 3182 3183 for (i = 0, cismax = 0; i < ismax; ++i) { 3184 PetscMPIInt isize; 3185 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3186 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3187 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3188 if (isize > 1) ++cismax; 3189 } 3190 3191 /* 3192 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3193 ispar counts the number of parallel ISs across C's comm. 3194 */ 3195 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3196 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3197 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3198 PetscFunctionReturn(0); 3199 } 3200 3201 /* if (ispar) */ 3202 /* 3203 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3204 These are used to extract the off-diag portion of the resulting parallel matrix. 
3205 The row IS for the off-diag portion is the same as for the diag portion, 3206 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3207 */ 3208 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3209 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3210 for (i = 0, ii = 0; i < ismax; ++i) { 3211 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3212 if (isize > 1) { 3213 /* 3214 TODO: This is the part that's ***NOT SCALABLE***. 3215 To fix this we need to extract just the indices of C's nonzero columns 3216 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3217 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3218 to be done without serializing on the IS list, so, most likely, it is best 3219 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3220 */ 3221 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3222 /* Now we have to 3223 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3224 were sorted on each rank, concatenated they might no longer be sorted; 3225 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3226 indices in the nondecreasing order to the original index positions. 3227 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3228 */ 3229 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3230 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3231 ++ii; 3232 } 3233 } 3234 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3235 for (i = 0, ii = 0; i < ismax; ++i) { 3236 PetscInt j,issize; 3237 const PetscInt *indices; 3238 3239 /* 3240 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 
3241 */ 3242 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3243 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3244 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3245 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3246 for (j = 1; j < issize; ++j) { 3247 if (indices[j] == indices[j-1]) { 3248 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3249 } 3250 } 3251 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3252 3253 3254 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3255 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3256 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3257 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3258 for (j = 1; j < issize; ++j) { 3259 if (indices[j-1] == indices[j]) { 3260 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3261 } 3262 } 3263 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3264 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3265 if (isize > 1) { 3266 cisrow[ii] = isrow[i]; 3267 ++ii; 3268 } 3269 } 3270 /* 3271 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3272 array of sequential matrices underlying the resulting parallel matrices. 3273 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3274 contain duplicates. 3275 3276 There are as many diag matrices as there are original index sets. There are only as many parallel 3277 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
3278 3279 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3280 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3281 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3282 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3283 with A[i] and B[ii] extracted from the corresponding MPI submat. 3284 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3285 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3286 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3287 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3288 values into A[i] and B[ii] sitting inside the corresponding submat. 3289 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3290 A[i], B[ii], AA[i] or BB[ii] matrices. 3291 */ 3292 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3293 if (scall == MAT_INITIAL_MATRIX) { 3294 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3295 } 3296 3297 /* Now obtain the sequential A and B submatrices separately. 
*/ 3298 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3299 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3300 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3301 3302 /* 3303 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3304 matrices A & B have been extracted directly into the parallel matrices containing them, or 3305 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3306 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3307 to have the same sparsity pattern. 3308 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3309 must be constructed for C. This is done by setseqmat(s). 3310 */ 3311 for (i = 0, ii = 0; i < ismax; ++i) { 3312 /* 3313 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3314 That way we can avoid sorting and computing permutations when reusing. 3315 To this end: 3316 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3317 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3318 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3319 */ 3320 MatStructure pattern; 3321 pattern = DIFFERENT_NONZERO_PATTERN; 3322 3323 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3324 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3325 if (isize > 1) { 3326 if (scall == MAT_INITIAL_MATRIX) { 3327 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3328 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3329 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3330 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3331 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3332 } 3333 /* 3334 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3335 */ 3336 { 3337 Mat AA,BB; 3338 AA = A[i]; 3339 BB = B[ii]; 3340 if (AA || BB) { 3341 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3342 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3343 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3344 } 3345 3346 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3347 } 3348 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3349 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3350 ++ii; 3351 } else { /* if (isize == 1) */ 3352 if (scall == MAT_REUSE_MATRIX) { 3353 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3354 } 3355 if (isrow_p[i] || iscol_p[i]) { 3356 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3357 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3358 /* Otherwise A is extracted straight into (*submats)[i]. */ 3359 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3360 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3361 } else (*submat)[i] = A[i]; 3362 } 3363 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3364 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3365 } 3366 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3367 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3368 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3369 ierr = PetscFree(A);CHKERRQ(ierr); 3370 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3371 PetscFunctionReturn(0); 3372 } 3373 3374 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3375 { 3376 PetscErrorCode ierr; 3377 3378 PetscFunctionBegin; 3379 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3380 PetscFunctionReturn(0); 3381 } 3382