/*
   Routines to compute overlapping regions of a parallel MPI matrix,
   and to find the submatrices that are shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *,PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get the row map to determine where each row should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve the IS data and put it all together so that we can optimize communication */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr     = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr     = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr   = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr   = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0;
j<length_i; j++){ 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank){ 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows){ 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++){ 115 if (tosizes_temp[i]){ 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 for (i=0; i<nto; i++){ 123 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 124 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 125 } 126 /* send information to other processors */ 127 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 128 nrecvrows = 0; 129 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 130 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 131 nrecvrows = 0; 132 for (i=0; i<nfrom; i++){ 133 for (j=0; j<fromsizes[2*i]; j++){ 134 remote[nrecvrows].rank = fromranks[i]; 135 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 136 } 137 } 138 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 139 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 140 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 141 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 142 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 143 /* message pair <no of is, row> */ 144 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 145 for (i=0; i<nrrows; i++){ 146 owner = rrow_ranks[i]; /* processor */ 147 j = tosizes_temp[owner]; /* index */ 148 todata[toffsets[j]++] = rrow_isids[i]; 149 todata[toffsets[j]++] = remoterows[i]; 150 } 151 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 152 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 153 ierr = PetscFree(toffsets);CHKERRQ(ierr); 154 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 155 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 157 /* send rows belonging to the remote so that then we could get the overlapping data back */ 158 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 159 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 160 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 161 ierr = 
PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 162 ierr = PetscFree(fromranks);CHKERRQ(ierr); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 165 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 166 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 167 nrecvrows = 0; 168 for (i=0; i<nto; i++){ 169 for (j=0; j<tosizes[2*i]; j++){ 170 remote[nrecvrows].rank = toranks[i]; 171 remote[nrecvrows++].index = tosizes[2*i+1]+j; 172 } 173 } 174 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 175 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 176 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 177 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 178 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 179 /* overlap communication and computation */ 180 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 181 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 182 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 183 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 184 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 185 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 186 ierr = PetscFree(toranks);CHKERRQ(ierr); 187 ierr = PetscFree(tosizes);CHKERRQ(ierr); 188 ierr = PetscFree(todata);CHKERRQ(ierr); 189 PetscFunctionReturn(0); 190 } 191 192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 193 { 194 PetscInt *isz,isz_i,i,j,is_id, data_size; 195 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 196 const PetscInt *indices_i_temp; 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 max_lsize = 0; 201 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 202 for (i=0; i<nidx; i++){ 203 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 204 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 205 isz[i] = lsize; 206 } 207 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 208 for (i=0; i<nidx; i++){ 209 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 210 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 212 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 213 } 214 /* retrieve information to get row id and its overlap */ 215 for (i=0; i<nrecvs; ){ 216 is_id = recvdata[i++]; 217 data_size = recvdata[i++]; 218 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 219 isz_i = isz[is_id]; 220 for (j=0; j< data_size; j++){ 221 col = recvdata[i++]; 222 indices_i[isz_i++] = col; 223 } 224 isz[is_id] = isz_i; 225 } 226 /* remove duplicate entities */ 227 for (i=0; i<nidx; i++){ 228 indices_i = indices_temp+(max_lsize+nrecvs)*i; 229 isz_i = isz[i]; 230 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 231 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 232 } 233 ierr = PetscFree(isz);CHKERRQ(ierr); 234 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 239 { 240 PetscLayout rmap,cmap; 241 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 242 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 243 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 244 const PetscInt *gcols,*ai,*aj,*bi,*bj; 245 Mat amat,bmat; 246 PetscMPIInt rank; 247 PetscBool done; 248 MPI_Comm comm; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 253 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 254 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 255 /* Even if the mat is symmetric, we still assume it is not symmetric */ 256 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 257 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 258 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 259 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 260 /* total number of nonzero values is used to estimate the memory usage in the next step */ 261 tnz = ai[an]+bi[bn]; 262 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 263 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 264 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 265 /* to find the longest message */ 266 max_fszs = 0; 267 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 268 /* better way to estimate number of nonzero in the mat??? 
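     (here tnz = ai[an] + bi[bn], the total number of nonzeros stored in the local diagonal (A)
      and off-diagonal (B) blocks, is used as the size of the scratch array indices_tmp below)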
*/ 269 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 270 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 271 rows_pos = 0; 272 totalrows = 0; 273 for (i=0; i<nfrom; i++){ 274 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 275 /* group data together */ 276 for (j=0; j<fromsizes[2*i]; j+=2){ 277 is_id = fromrows[rows_pos++];/* no of is */ 278 rows_i = rows_data[is_id]; 279 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 280 } 281 /* estimate a space to avoid multiple allocations */ 282 for (j=0; j<nidx; j++){ 283 indvc_ij = 0; 284 rows_i = rows_data[j]; 285 for (l=0; l<rows_pos_i[j]; l++){ 286 row = rows_i[l]-rstart; 287 start = ai[row]; 288 end = ai[row+1]; 289 for (k=start; k<end; k++){ /* Amat */ 290 col = aj[k] + cstart; 291 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 292 } 293 start = bi[row]; 294 end = bi[row+1]; 295 for (k=start; k<end; k++) { /* Bmat */ 296 col = gcols[bj[k]]; 297 indices_tmp[indvc_ij++] = col; 298 } 299 } 300 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 301 indv_counts[i*nidx+j] = indvc_ij; 302 totalrows += indvc_ij; 303 } 304 } 305 /* message triple <no of is, number of rows, rows> */ 306 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 307 totalrows = 0; 308 rows_pos = 0; 309 /* use this code again */ 310 for (i=0;i<nfrom;i++){ 311 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 312 for (j=0; j<fromsizes[2*i]; j+=2){ 313 is_id = fromrows[rows_pos++]; 314 rows_i = rows_data[is_id]; 315 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 316 } 317 /* add data */ 318 for (j=0; j<nidx; j++){ 319 if (!indv_counts[i*nidx+j]) continue; 320 indvc_ij = 0; 321 sbdata[totalrows++] = j; 322 sbdata[totalrows++] = indv_counts[i*nidx+j]; 323 sbsizes[2*i] += 2; 324 rows_i = rows_data[j]; 325 for (l=0; l<rows_pos_i[j]; l++){ 326 row = rows_i[l]-rstart; 327 start = ai[row]; 328 end = ai[row+1]; 329 for (k=start; k<end; k++){ /* Amat */ 330 col = aj[k] + cstart; 331 indices_tmp[indvc_ij++] = col; 332 } 333 start = bi[row]; 334 end = bi[row+1]; 335 for (k=start; k<end; k++) { /* Bmat */ 336 col = gcols[bj[k]]; 337 indices_tmp[indvc_ij++] = col; 338 } 339 } 340 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 341 sbsizes[2*i] += indvc_ij; 342 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 343 totalrows += indvc_ij; 344 } 345 } 346 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 347 for (i=0; i<nfrom; i++){ 348 offsets[i+1] = offsets[i] + sbsizes[2*i]; 349 sbsizes[2*i+1] = offsets[i]; 350 } 351 ierr = PetscFree(offsets);CHKERRQ(ierr); 352 if (sbrowsizes) *sbrowsizes = sbsizes; 353 if (sbrows) *sbrows = sbdata; 354 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 355 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 356 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 357 PetscFunctionReturn(0); 358 } 359 360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 361 { 362 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 363 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 364 PetscInt lsize,lsize_tmp,owner; 365 PetscMPIInt rank; 366 Mat amat,bmat; 367 PetscBool done; 368 PetscLayout 
cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is this a safe way to compute the number of nonzero values? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* this is a better way to estimate memory than the old implementation,
   * where the global size of the matrix was used
   */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local row number */
      row  -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5], the message looks like

  mesg[0] = 2                     (number of index sets in the message)
  -----------
  mesg[1] = 1                     => is[1]
  mesg[2] = sizeof(is[1]);
  -----------
  mesg[3] = 5                     => is[5]
  mesg[4] = sizeof(is[5]);
  -----------
  mesg[5] ... mesg[n]             data(is[1])
  -----------
  mesg[n+1] ... mesg[m]           data(is[5])
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt
*ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 464 PetscBT *table; 465 MPI_Comm comm; 466 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 467 MPI_Status *s_status,*recv_status; 468 char *t_p; 469 470 PetscFunctionBegin; 471 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 472 size = c->size; 473 rank = c->rank; 474 M = C->rmap->N; 475 476 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 477 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 478 479 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 480 481 for (i=0; i<imax; i++) { 482 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 483 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 484 } 485 486 /* evaluate communication - mesg to who,length of mesg, and buffer space 487 required. Based on this, buffers are allocated, and data copied into them */ 488 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 489 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 490 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 491 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 492 for (i=0; i<imax; i++) { 493 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 494 idx_i = idx[i]; 495 len = n[i]; 496 for (j=0; j<len; j++) { 497 row = idx_i[j]; 498 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 499 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 500 w4[proc]++; 501 } 502 for (j=0; j<size; j++) { 503 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 504 } 505 } 506 507 nrqs = 0; /* no of outgoing messages */ 508 msz = 0; /* total mesg length (for all proc */ 509 w1[rank] = 0; /* no mesg sent to intself */ 510 w3[rank] = 0; 511 for (i=0; i<size; i++) { 512 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 513 } 514 /* pa - is list of processors to communicate with */ 515 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 516 for (i=0,j=0; i<size; i++) { 517 if (w1[i]) {pa[j] = i; j++;} 518 } 519 520 /* Each message would have a header = 1 + 2*(no of IS) + data */ 521 for (i=0; i<nrqs; i++) { 522 j = pa[i]; 523 w1[j] += w2[j] + 2*w3[j]; 524 msz += w1[j]; 525 } 526 527 /* Determine the number of messages to expect, their lengths, from from-ids */ 528 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 529 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 530 531 /* Now post the Irecvs corresponding to these messages */ 532 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 533 534 /* Allocate Memory for outgoing messages */ 535 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 536 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 537 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 538 539 { 540 PetscInt *iptr = tmp,ict = 0; 541 for (i=0; i<nrqs; i++) { 542 j = pa[i]; 543 iptr += ict; 544 outdat[j] = iptr; 545 ict = w1[j]; 546 } 547 } 548 549 /* Form the outgoing messages */ 550 /* plug in the headers */ 551 for (i=0; i<nrqs; i++) { 552 j = pa[i]; 553 outdat[j][0] = 0; 554 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 555 ptr[j] = outdat[j] + 2*w3[j] + 1; 556 } 557 558 /* Memory for doing local proc's work */ 559 { 560 PetscInt M_BPB_imax = 0; 561 #if defined(PETSC_USE_CTABLE) 
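    /* Work-space layout: table[i] is a bit table with one bit per global index of C, used to
       test in O(1) whether an index already belongs to IS i.  With PETSC_USE_CTABLE the
       accumulated indices themselves are kept in the per-IS hash tables table_data[i]
       (keys and values are shifted by 1 because a PetscTable cannot hold 0), instead of the
       dense arrays data[i] of length M used in the #else branch, avoiding O(M*imax) integer
       storage. */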
562 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 563 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 564 for (i=0; i<imax; i++) { 565 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 566 } 567 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 568 for (i=0; i<imax; i++) { 569 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 570 } 571 #else 572 PetscInt Mimax = 0; 573 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 574 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 575 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 576 for (i=0; i<imax; i++) { 577 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 578 data[i] = d_p + M*i; 579 } 580 #endif 581 } 582 583 /* Parse the IS and update local tables and the outgoing buf with the data */ 584 { 585 PetscInt n_i,isz_i,*outdat_j,ctr_j; 586 PetscBT table_i; 587 588 for (i=0; i<imax; i++) { 589 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 590 n_i = n[i]; 591 table_i = table[i]; 592 idx_i = idx[i]; 593 #if defined(PETSC_USE_CTABLE) 594 table_data_i = table_data[i]; 595 #else 596 data_i = data[i]; 597 #endif 598 isz_i = isz[i]; 599 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 600 row = idx_i[j]; 601 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 602 if (proc != rank) { /* copy to the outgoing buffer */ 603 ctr[proc]++; 604 *ptr[proc] = row; 605 ptr[proc]++; 606 } else if (!PetscBTLookupSet(table_i,row)) { 607 #if defined(PETSC_USE_CTABLE) 608 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 609 #else 610 data_i[isz_i] = row; /* Update the local table */ 611 #endif 612 isz_i++; 613 } 614 } 615 /* Update the headers for the current IS */ 616 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 617 if ((ctr_j = ctr[j])) { 618 outdat_j = outdat[j]; 619 k = ++outdat_j[0]; 620 outdat_j[2*k] = ctr_j; 621 outdat_j[2*k-1] = i; 622 } 623 } 624 isz[i] = isz_i; 625 } 626 } 627 628 /* Now post the sends */ 629 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 630 for (i=0; i<nrqs; ++i) { 631 j = pa[i]; 632 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 633 } 634 635 /* No longer need the original indices */ 636 for (i=0; i<imax; ++i) { 637 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 638 } 639 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 640 641 for (i=0; i<imax; ++i) { 642 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 643 } 644 645 /* Do Local work */ 646 #if defined(PETSC_USE_CTABLE) 647 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 648 #else 649 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 650 #endif 651 652 /* Receive messages */ 653 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 654 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 655 656 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 657 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 658 659 /* Phase 1 sends are complete - deallocate buffers */ 660 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 661 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 662 663 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 664 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 665 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 666 ierr = 
PetscFree(rbuf[0]);CHKERRQ(ierr); 667 ierr = PetscFree(rbuf);CHKERRQ(ierr); 668 669 670 /* Send the data back */ 671 /* Do a global reduction to know the buffer space req for incoming messages */ 672 { 673 PetscMPIInt *rw1; 674 675 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 676 677 for (i=0; i<nrqr; ++i) { 678 proc = recv_status[i].MPI_SOURCE; 679 680 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 681 rw1[proc] = isz1[i]; 682 } 683 ierr = PetscFree(onodes1);CHKERRQ(ierr); 684 ierr = PetscFree(olengths1);CHKERRQ(ierr); 685 686 /* Determine the number of messages to expect, their lengths, from from-ids */ 687 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 688 ierr = PetscFree(rw1);CHKERRQ(ierr); 689 } 690 /* Now post the Irecvs corresponding to these messages */ 691 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 692 693 /* Now post the sends */ 694 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 695 for (i=0; i<nrqr; ++i) { 696 j = recv_status[i].MPI_SOURCE; 697 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 698 } 699 700 /* receive work done on other processors */ 701 { 702 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 703 PetscMPIInt idex; 704 PetscBT table_i; 705 MPI_Status *status2; 706 707 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 708 for (i=0; i<nrqs; ++i) { 709 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 710 /* Process the message */ 711 rbuf2_i = rbuf2[idex]; 712 ct1 = 2*rbuf2_i[0]+1; 713 jmax = rbuf2[idex][0]; 714 for (j=1; j<=jmax; j++) { 715 max = rbuf2_i[2*j]; 716 is_no = rbuf2_i[2*j-1]; 717 isz_i = isz[is_no]; 718 table_i = table[is_no]; 719 #if defined(PETSC_USE_CTABLE) 720 table_data_i = table_data[is_no]; 721 #else 722 data_i = data[is_no]; 723 #endif 724 for (k=0; k<max; k++,ct1++) { 725 row = rbuf2_i[ct1]; 726 if (!PetscBTLookupSet(table_i,row)) { 727 #if defined(PETSC_USE_CTABLE) 728 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 729 #else 730 data_i[isz_i] = row; 731 #endif 732 isz_i++; 733 } 734 } 735 isz[is_no] = isz_i; 736 } 737 } 738 739 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 740 ierr = PetscFree(status2);CHKERRQ(ierr); 741 } 742 743 #if defined(PETSC_USE_CTABLE) 744 tcount_max = 0; 745 for (i=0; i<imax; ++i) { 746 table_data_i = table_data[i]; 747 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 748 if (tcount_max < tcount) tcount_max = tcount; 749 } 750 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 751 #endif 752 753 for (i=0; i<imax; ++i) { 754 #if defined(PETSC_USE_CTABLE) 755 PetscTablePosition tpos; 756 table_data_i = table_data[i]; 757 758 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 759 while (tpos) { 760 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 761 tdata[--j] = --k; 762 } 763 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 764 #else 765 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 766 #endif 767 } 768 769 ierr = PetscFree(onodes2);CHKERRQ(ierr); 770 ierr = PetscFree(olengths2);CHKERRQ(ierr); 771 772 ierr = PetscFree(pa);CHKERRQ(ierr); 773 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 774 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 775 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 776 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 777 ierr = 
PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

   Inputs:
     C     - MATMPIAIJ matrix;
     imax  - total number of index sets processed at a time;
     table - an array of bit tables (PetscBT), each of size M bits.

   Output:
     isz   - array containing the count of the solution elements corresponding
             to each index set;
     data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
     C    - the matrix
     nrqr - number of messages being processed.
     rbuf - an array of pointers to the received requests

   Output:
     xdata - array of messages to be sent back
     isz1  - size of each message

   For better efficiency perhaps we should malloc each xdata[i] separately;
   then, if a re-malloc is required, we would need to copy only the data for
   that one row rather than all previous rows, as it is now where a single
   large chunk of memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   +=  ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
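            /* note: each time the shared buffer xdata[0] is grown above, the per-message
               pointers xdata[1..i] are rebased into the new allocation, since all outgoing
               replies live in one contiguous chunk */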
1003 } 1004 start = bi[row]; 1005 end = bi[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = garray[bj[l]]; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 } 1023 /* Update the header*/ 1024 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1025 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1026 } 1027 xdata[i][0] = rbuf_0; 1028 xdata[i+1] = xdata[i] + ct2; 1029 isz1[i] = ct2; /* size of each message */ 1030 } 1031 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1032 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1033 PetscFunctionReturn(0); 1034 } 1035 /* -------------------------------------------------------------------------*/ 1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1038 /* 1039 Every processor gets the entire matrix 1040 */ 1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1042 { 1043 Mat B; 1044 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1045 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1046 PetscErrorCode ierr; 1047 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1048 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1049 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1050 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1051 1052 PetscFunctionBegin; 1053 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1054 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1055 1056 if (scall == MAT_INITIAL_MATRIX) { 1057 /* ---------------------------------------------------------------- 1058 Tell every processor the number of nonzeros per row 1059 */ 1060 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1061 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1062 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1063 } 1064 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1065 for (i=0; i<size; i++) { 1066 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1067 displs[i] = A->rmap->range[i]; 1068 } 1069 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1070 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1071 #else 1072 sendcount = A->rmap->rend - A->rmap->rstart; 1073 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1074 #endif 1075 /* --------------------------------------------------------------- 1076 Create the sequential matrix of the same type as the local block diagonal 1077 */ 1078 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1079 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1080 ierr = 
MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1081 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1082 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1083 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1084 **Bin = B; 1085 b = (Mat_SeqAIJ*)B->data; 1086 1087 /*-------------------------------------------------------------------- 1088 Copy my part of matrix column indices over 1089 */ 1090 sendcount = ad->nz + bd->nz; 1091 jsendbuf = b->j + b->i[rstarts[rank]]; 1092 a_jsendbuf = ad->j; 1093 b_jsendbuf = bd->j; 1094 n = A->rmap->rend - A->rmap->rstart; 1095 cnt = 0; 1096 for (i=0; i<n; i++) { 1097 1098 /* put in lower diagonal portion */ 1099 m = bd->i[i+1] - bd->i[i]; 1100 while (m > 0) { 1101 /* is it above diagonal (in bd (compressed) numbering) */ 1102 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1103 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1104 m--; 1105 } 1106 1107 /* put in diagonal portion */ 1108 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1109 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1110 } 1111 1112 /* put in upper diagonal portion */ 1113 while (m-- > 0) { 1114 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1115 } 1116 } 1117 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1118 1119 /*-------------------------------------------------------------------- 1120 Gather all column indices to all processors 1121 */ 1122 for (i=0; i<size; i++) { 1123 recvcounts[i] = 0; 1124 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1125 recvcounts[i] += lens[j]; 1126 } 1127 } 1128 displs[0] = 0; 1129 for (i=1; i<size; i++) { 1130 displs[i] = displs[i-1] + recvcounts[i-1]; 1131 } 1132 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1133 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1134 #else 1135 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1136 #endif 1137 /*-------------------------------------------------------------------- 1138 Assemble the matrix into useable form (note numerical values not yet set) 1139 */ 1140 /* set the b->ilen (length of each row) values */ 1141 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1142 /* set the b->i indices */ 1143 b->i[0] = 0; 1144 for (i=1; i<=A->rmap->N; i++) { 1145 b->i[i] = b->i[i-1] + lens[i-1]; 1146 } 1147 ierr = PetscFree(lens);CHKERRQ(ierr); 1148 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1149 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1150 1151 } else { 1152 B = **Bin; 1153 b = (Mat_SeqAIJ*)B->data; 1154 } 1155 1156 /*-------------------------------------------------------------------- 1157 Copy my part of matrix numerical values into the values location 1158 */ 1159 if (flag == MAT_GET_VALUES) { 1160 sendcount = ad->nz + bd->nz; 1161 sendbuf = b->a + b->i[rstarts[rank]]; 1162 a_sendbuf = ad->a; 1163 b_sendbuf = bd->a; 1164 b_sendj = bd->j; 1165 n = A->rmap->rend - A->rmap->rstart; 1166 cnt = 0; 1167 for (i=0; i<n; i++) { 1168 1169 /* put in lower diagonal portion */ 1170 m = bd->i[i+1] - bd->i[i]; 1171 while (m > 0) { 1172 /* is it above diagonal (in bd (compressed) numbering) */ 1173 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1174 sendbuf[cnt++] = *b_sendbuf++; 1175 m--; 1176 b_sendj++; 1177 } 1178 1179 /* put in diagonal portion */ 1180 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1181 sendbuf[cnt++] = 
*a_sendbuf++; 1182 } 1183 1184 /* put in upper diagonal portion */ 1185 while (m-- > 0) { 1186 sendbuf[cnt++] = *b_sendbuf++; 1187 b_sendj++; 1188 } 1189 } 1190 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1191 1192 /* ----------------------------------------------------------------- 1193 Gather all numerical values to all processors 1194 */ 1195 if (!recvcounts) { 1196 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1197 } 1198 for (i=0; i<size; i++) { 1199 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1200 } 1201 displs[0] = 0; 1202 for (i=1; i<size; i++) { 1203 displs[i] = displs[i-1] + recvcounts[i-1]; 1204 } 1205 recvbuf = b->a; 1206 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1207 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1208 #else 1209 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1210 #endif 1211 } /* endof (flag == MAT_GET_VALUES) */ 1212 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1213 1214 if (A->symmetric) { 1215 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1216 } else if (A->hermitian) { 1217 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1218 } else if (A->structurally_symmetric) { 1219 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1220 } 1221 PetscFunctionReturn(0); 1222 } 1223 1224 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1225 { 1226 PetscErrorCode ierr; 1227 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1228 Mat_SubMat *submatj = c->submatis1; 1229 PetscInt i; 1230 1231 PetscFunctionBegin; 1232 if (!submatj->id) { /* delete data that are linked only to submats[id=0] */ 1233 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1234 1235 for (i=0; i<submatj->nrqr; ++i) { 1236 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1237 } 1238 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1239 1240 if (submatj->rbuf1) { 1241 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1242 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1243 } 1244 1245 for (i=0; i<submatj->nrqs; ++i) { 1246 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1247 } 1248 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1249 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1250 } 1251 1252 #if defined(PETSC_USE_CTABLE) 1253 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1254 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1255 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1256 #else 1257 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1258 #endif 1259 1260 if (!submatj->allcolumns) { 1261 #if defined(PETSC_USE_CTABLE) 1262 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1263 #else 1264 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1265 #endif 1266 } 1267 ierr = submatj->destroy(C);CHKERRQ(ierr); 1268 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 1269 1270 ierr = PetscFree(submatj);CHKERRQ(ierr); 1271 PetscFunctionReturn(0); 1272 } 1273 1274 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1275 { 1276 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1277 
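  /* Outline of the request/reply protocol used below; the buffers declared here belong to these phases:
       phase 1 (tag1): sbuf1/rbuf1   - each process sends the owning processes the list of rows it needs;
       phase 2 (tag2): sbuf2/rbuf2   - the owners reply with the number of columns of each requested row;
       phase 3 (tag3): sbuf_aj/rbuf3 - the owners send the global column indices, used to fill lens[] for preallocation;
       phase 4 (tag4): sbuf_aa/rbuf4 - likewise for the numerical values.
     Received global indices are translated to submatrix numbering through rmap (rows) and cmap (columns). */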
Mat submat,A = c->A,B = c->B; 1278 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1279 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1280 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1281 const PetscInt *icol,*irow; 1282 PetscInt nrow,ncol,start; 1283 PetscErrorCode ierr; 1284 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1285 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1286 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1287 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1288 PetscInt *lens,rmax,ncols,*cols,Crow; 1289 #if defined(PETSC_USE_CTABLE) 1290 PetscTable cmap,rmap; 1291 PetscInt *cmap_loc,*rmap_loc; 1292 #else 1293 PetscInt *cmap,*rmap; 1294 #endif 1295 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1296 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1297 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1298 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1299 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1300 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1301 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1302 MPI_Comm comm; 1303 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1304 PetscMPIInt *onodes1,*olengths1,idex,end; 1305 Mat_SubMat *smatis1; 1306 PetscBool isrowsorted; 1307 1308 PetscFunctionBegin; 1309 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1310 1311 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1312 size = c->size; 1313 rank = c->rank; 1314 1315 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1316 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1317 1318 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1319 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1320 if (allcolumns) { 1321 icol = NULL; 1322 ncol = C->cmap->N; 1323 } else { 1324 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1325 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1326 } 1327 1328 if (scall == MAT_INITIAL_MATRIX) { 1329 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1330 1331 /* Get some new tags to keep the communication clean */ 1332 tag1 = ((PetscObject)C)->tag; 1333 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1334 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1335 1336 /* evaluate communication - mesg to who, length of mesg, and buffer space 1337 required. Based on this, buffers are allocated, and data copied into them */ 1338 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1339 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1340 1341 /* w1[proc] = num of rows owned by proc -- to be requested */ 1342 proc = 0; 1343 nrqs = 0; /* num of outgoing messages */ 1344 for (j=0; j<nrow; j++) { 1345 row = irow[j]; /* sorted! 
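 - isrow[0] was verified to be sorted above, so rows arrive in increasing order
                   and the ownership search below only ever advances proc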
*/ 1346 while (row >= C->rmap->range[proc+1]) proc++; 1347 w1[proc]++; 1348 row2proc[j] = proc; /* map row index to proc */ 1349 1350 if (proc != rank && !w2[proc]) { 1351 w2[proc] = 1; nrqs++; 1352 } 1353 } 1354 w1[rank] = 0; /* rows owned by self will not be requested */ 1355 1356 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1357 for (proc=0,j=0; proc<size; proc++) { 1358 if (w1[proc]) { pa[j++] = proc;} 1359 } 1360 1361 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1362 msz = 0; /* total mesg length (for all procs) */ 1363 for (i=0; i<nrqs; i++) { 1364 proc = pa[i]; 1365 w1[proc] += 3; 1366 msz += w1[proc]; 1367 } 1368 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1369 1370 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1371 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1372 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1373 1374 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1375 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1376 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1377 1378 /* Now post the Irecvs corresponding to these messages */ 1379 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1380 1381 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1382 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1383 1384 /* Allocate Memory for outgoing messages */ 1385 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1386 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1387 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1388 1389 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1390 iptr = tmp; 1391 for (i=0; i<nrqs; i++) { 1392 proc = pa[i]; 1393 sbuf1[proc] = iptr; 1394 iptr += w1[proc]; 1395 } 1396 1397 /* Form the outgoing messages */ 1398 /* Initialize the header space */ 1399 for (i=0; i<nrqs; i++) { 1400 proc = pa[i]; 1401 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1402 ptr[proc] = sbuf1[proc] + 3; 1403 } 1404 1405 /* Parse the isrow and copy data into outbuf */ 1406 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1407 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1408 proc = row2proc[j]; 1409 if (proc != rank) { /* copy to the outgoing buf*/ 1410 *ptr[proc] = irow[j]; 1411 ctr[proc]++; ptr[proc]++; 1412 } 1413 } 1414 1415 /* Update the headers for the current IS */ 1416 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1417 if ((ctr_j = ctr[j])) { 1418 sbuf1_j = sbuf1[j]; 1419 k = ++sbuf1_j[0]; 1420 sbuf1_j[2*k] = ctr_j; 1421 sbuf1_j[2*k-1] = 0; 1422 } 1423 } 1424 1425 /* Now post the sends */ 1426 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1427 for (i=0; i<nrqs; ++i) { 1428 proc = pa[i]; 1429 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1430 } 1431 1432 /* Post Receives to capture the buffer size */ 1433 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1434 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1435 1436 rbuf2[0] = tmp + msz; 1437 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1438 1439 for (i=0; i<nrqs; ++i) { 1440 proc = pa[i]; 1441 ierr = 
MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1442 } 1443 1444 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1445 1446 /* Send to other procs the buf size they should allocate */ 1447 /* Receive messages*/ 1448 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1449 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1450 1451 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1452 for (i=0; i<nrqr; ++i) { 1453 req_size[i] = 0; 1454 rbuf1_i = rbuf1[i]; 1455 start = 2*rbuf1_i[0] + 1; 1456 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1457 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1458 sbuf2_i = sbuf2[i]; 1459 for (j=start; j<end; j++) { 1460 k = rbuf1_i[j] - rstart; 1461 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1462 sbuf2_i[j] = ncols; 1463 req_size[i] += ncols; 1464 } 1465 req_source1[i] = r_status1[i].MPI_SOURCE; 1466 1467 /* form the header */ 1468 sbuf2_i[0] = req_size[i]; 1469 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1470 1471 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1472 } 1473 1474 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1475 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1476 1477 /* rbuf2 is received, Post recv column indices a->j */ 1478 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1479 1480 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1481 for (i=0; i<nrqs; ++i) { 1482 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1483 req_source2[i] = r_status2[i].MPI_SOURCE; 1484 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1485 } 1486 1487 /* Wait on sends1 and sends2 */ 1488 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1489 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1490 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1491 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1492 1493 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1494 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1495 1496 /* Now allocate sending buffers for a->j, and send them off */ 1497 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1498 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1499 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1500 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1501 1502 for (i=0; i<nrqr; i++) { /* for each requested message */ 1503 rbuf1_i = rbuf1[i]; 1504 sbuf_aj_i = sbuf_aj[i]; 1505 ct1 = 2*rbuf1_i[0] + 1; 1506 ct2 = 0; 1507 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1508 1509 kmax = rbuf1[i][2]; 1510 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1511 row = rbuf1_i[ct1] - rstart; 1512 nzA = ai[row+1] - ai[row]; 1513 nzB = bi[row+1] - bi[row]; 1514 ncols = nzA + nzB; 1515 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1516 1517 /* load the column indices for this row into cols*/ 1518 cols = sbuf_aj_i + ct2; 1519 1520 lwrite = 0; 1521 for (l=0; l<nzB; l++) { 1522 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1523 } 1524 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1525 for (l=0; l<nzB; l++) { 1526 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1527 } 1528 1529 ct2 += ncols; 1530 } 1531 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1532 } 1533 1534 /* create column map (cmap): global col of C 
-> local col of submat */ 1535 #if defined(PETSC_USE_CTABLE) 1536 if (!allcolumns) { 1537 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1538 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1539 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1540 if (icol[j] >= cstart && icol[j] <cend) { 1541 cmap_loc[icol[j] - cstart] = j+1; 1542 } else { /* use PetscTable for non-local col indices */ 1543 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1544 } 1545 } 1546 } else { 1547 cmap = NULL; 1548 cmap_loc = NULL; 1549 } 1550 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1551 #else 1552 if (!allcolumns) { 1553 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1554 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1555 } else { 1556 cmap = NULL; 1557 } 1558 #endif 1559 1560 /* Create lens for MatSeqAIJSetPreallocation() */ 1561 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1562 1563 /* Compute lens from local part of C */ 1564 for (j=0; j<nrow; j++) { 1565 row = irow[j]; 1566 proc = row2proc[j]; 1567 if (proc == rank) { 1568 /* diagonal part A = c->A */ 1569 ncols = ai[row-rstart+1] - ai[row-rstart]; 1570 cols = aj + ai[row-rstart]; 1571 if (!allcolumns) { 1572 for (k=0; k<ncols; k++) { 1573 #if defined(PETSC_USE_CTABLE) 1574 tcol = cmap_loc[cols[k]]; 1575 #else 1576 tcol = cmap[cols[k]+cstart]; 1577 #endif 1578 if (tcol) lens[j]++; 1579 } 1580 } else { /* allcolumns */ 1581 lens[j] = ncols; 1582 } 1583 1584 /* off-diagonal part B = c->B */ 1585 ncols = bi[row-rstart+1] - bi[row-rstart]; 1586 cols = bj + bi[row-rstart]; 1587 if (!allcolumns) { 1588 for (k=0; k<ncols; k++) { 1589 #if defined(PETSC_USE_CTABLE) 1590 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1591 #else 1592 tcol = cmap[bmap[cols[k]]]; 1593 #endif 1594 if (tcol) lens[j]++; 1595 } 1596 } else { /* allcolumns */ 1597 lens[j] += ncols; 1598 } 1599 } 1600 } 1601 1602 /* Create row map (rmap): global row of C -> local row of submat */ 1603 #if defined(PETSC_USE_CTABLE) 1604 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1605 for (j=0; j<nrow; j++) { 1606 row = irow[j]; 1607 proc = row2proc[j]; 1608 if (proc == rank) { /* a local row */ 1609 rmap_loc[row - rstart] = j; 1610 } else { 1611 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1612 } 1613 } 1614 #else 1615 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1616 for (j=0; j<nrow; j++) { 1617 rmap[irow[j]] = j; 1618 } 1619 #endif 1620 1621 /* Update lens from offproc data */ 1622 /* recv a->j is done */ 1623 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1624 for (i=0; i<nrqs; i++) { 1625 proc = pa[i]; 1626 sbuf1_i = sbuf1[proc]; 1627 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1628 ct1 = 2 + 1; 1629 ct2 = 0; 1630 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1631 rbuf3_i = rbuf3[i]; /* received C->j */ 1632 1633 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1634 max1 = sbuf1_i[2]; 1635 for (k=0; k<max1; k++,ct1++) { 1636 #if defined(PETSC_USE_CTABLE) 1637 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1638 row--; 1639 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1640 #else 1641 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1642 #endif 1643 /* Now, store row index of submat in sbuf1_i[ct1] */ 1644 sbuf1_i[ct1] = row; 1645 1646 nnz = rbuf2_i[ct1]; 1647 if 
(!allcolumns) { 1648 for (l=0; l<nnz; l++,ct2++) { 1649 #if defined(PETSC_USE_CTABLE) 1650 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1651 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1652 } else { 1653 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1654 } 1655 #else 1656 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1657 #endif 1658 if (tcol) lens[row]++; 1659 } 1660 } else { /* allcolumns */ 1661 lens[row] += nnz; 1662 } 1663 } 1664 } 1665 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1666 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1667 1668 /* Create the submatrices */ 1669 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1670 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1671 1672 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1673 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1674 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1675 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1676 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1677 1678 /* create struct Mat_SubMat and attached it to submat */ 1679 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1680 subc = (Mat_SeqAIJ*)submat->data; 1681 subc->submatis1 = smatis1; 1682 1683 smatis1->id = 0; 1684 smatis1->nrqs = nrqs; 1685 smatis1->nrqr = nrqr; 1686 smatis1->rbuf1 = rbuf1; 1687 smatis1->rbuf2 = rbuf2; 1688 smatis1->rbuf3 = rbuf3; 1689 smatis1->sbuf2 = sbuf2; 1690 smatis1->req_source2 = req_source2; 1691 1692 smatis1->sbuf1 = sbuf1; 1693 smatis1->ptr = ptr; 1694 smatis1->tmp = tmp; 1695 smatis1->ctr = ctr; 1696 1697 smatis1->pa = pa; 1698 smatis1->req_size = req_size; 1699 smatis1->req_source1 = req_source1; 1700 1701 smatis1->allcolumns = allcolumns; 1702 smatis1->row2proc = row2proc; 1703 smatis1->rmap = rmap; 1704 smatis1->cmap = cmap; 1705 #if defined(PETSC_USE_CTABLE) 1706 smatis1->rmap_loc = rmap_loc; 1707 smatis1->cmap_loc = cmap_loc; 1708 #endif 1709 1710 smatis1->destroy = submat->ops->destroy; 1711 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1712 submat->factortype = C->factortype; 1713 1714 /* compute rmax */ 1715 rmax = 0; 1716 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1717 1718 } else { /* scall == MAT_REUSE_MATRIX */ 1719 submat = submats[0]; 1720 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1721 1722 subc = (Mat_SeqAIJ*)submat->data; 1723 rmax = subc->rmax; 1724 smatis1 = subc->submatis1; 1725 nrqs = smatis1->nrqs; 1726 nrqr = smatis1->nrqr; 1727 rbuf1 = smatis1->rbuf1; 1728 rbuf2 = smatis1->rbuf2; 1729 rbuf3 = smatis1->rbuf3; 1730 req_source2 = smatis1->req_source2; 1731 1732 sbuf1 = smatis1->sbuf1; 1733 sbuf2 = smatis1->sbuf2; 1734 ptr = smatis1->ptr; 1735 tmp = smatis1->tmp; 1736 ctr = smatis1->ctr; 1737 1738 pa = smatis1->pa; 1739 req_size = smatis1->req_size; 1740 req_source1 = smatis1->req_source1; 1741 1742 allcolumns = smatis1->allcolumns; 1743 row2proc = smatis1->row2proc; 1744 rmap = smatis1->rmap; 1745 cmap = smatis1->cmap; 1746 #if defined(PETSC_USE_CTABLE) 1747 rmap_loc = smatis1->rmap_loc; 1748 cmap_loc = smatis1->cmap_loc; 1749 #endif 1750 } 1751 1752 /* Post recv matrix values */ 1753 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1754 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1755 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1756 for (i=0; i<nrqs; ++i) { 1757 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1758 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1759 } 1760 1761 /* Allocate sending buffers for a->a, and send them off */ 1762 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1763 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1764 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1765 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1766 1767 for (i=0; i<nrqr; i++) { 1768 rbuf1_i = rbuf1[i]; 1769 sbuf_aa_i = sbuf_aa[i]; 1770 ct1 = 2*rbuf1_i[0]+1; 1771 ct2 = 0; 1772 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1773 1774 kmax = rbuf1_i[2]; 1775 for (k=0; k<kmax; k++,ct1++) { 1776 row = rbuf1_i[ct1] - rstart; 1777 nzA = ai[row+1] - ai[row]; 1778 nzB = bi[row+1] - bi[row]; 1779 ncols = nzA + nzB; 1780 cworkB = bj + bi[row]; 1781 vworkA = a_a + ai[row]; 1782 vworkB = b_a + bi[row]; 1783 1784 /* load the column values for this row into vals*/ 1785 vals = sbuf_aa_i + ct2; 1786 1787 lwrite = 0; 1788 for (l=0; l<nzB; l++) { 1789 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1790 } 1791 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1792 for (l=0; l<nzB; l++) { 1793 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1794 } 1795 1796 ct2 += ncols; 1797 } 1798 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1799 } 1800 1801 /* Assemble submat */ 1802 /* First assemble the local rows */ 1803 for (j=0; j<nrow; j++) { 1804 row = irow[j]; 1805 proc = row2proc[j]; 1806 if (proc == rank) { 1807 Crow = row - rstart; /* local row index of C */ 1808 #if defined(PETSC_USE_CTABLE) 1809 row = rmap_loc[Crow]; /* row index of submat */ 1810 #else 1811 row = rmap[row]; 1812 #endif 1813 1814 if (allcolumns) { 1815 /* diagonal part A = c->A */ 1816 ncols = ai[Crow+1] - ai[Crow]; 1817 cols = aj + ai[Crow]; 1818 vals = a->a + ai[Crow]; 1819 i = 0; 1820 for (k=0; k<ncols; k++) { 1821 subcols[i] = cols[k] + cstart; 1822 subvals[i++] = vals[k]; 1823 } 1824 1825 /* off-diagonal part B = c->B */ 1826 ncols = bi[Crow+1] - bi[Crow]; 1827 cols = bj + bi[Crow]; 1828 vals = b->a + bi[Crow]; 1829 for (k=0; k<ncols; k++) { 1830 subcols[i] = bmap[cols[k]]; 1831 subvals[i++] = vals[k]; 1832 } 1833 1834 ierr = 
MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1835 1836 } else { /* !allcolumns */ 1837 #if defined(PETSC_USE_CTABLE) 1838 /* diagonal part A = c->A */ 1839 ncols = ai[Crow+1] - ai[Crow]; 1840 cols = aj + ai[Crow]; 1841 vals = a->a + ai[Crow]; 1842 i = 0; 1843 for (k=0; k<ncols; k++) { 1844 tcol = cmap_loc[cols[k]]; 1845 if (tcol) { 1846 subcols[i] = --tcol; 1847 subvals[i++] = vals[k]; 1848 } 1849 } 1850 1851 /* off-diagonal part B = c->B */ 1852 ncols = bi[Crow+1] - bi[Crow]; 1853 cols = bj + bi[Crow]; 1854 vals = b->a + bi[Crow]; 1855 for (k=0; k<ncols; k++) { 1856 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1857 if (tcol) { 1858 subcols[i] = --tcol; 1859 subvals[i++] = vals[k]; 1860 } 1861 } 1862 #else 1863 /* diagonal part A = c->A */ 1864 ncols = ai[Crow+1] - ai[Crow]; 1865 cols = aj + ai[Crow]; 1866 vals = a->a + ai[Crow]; 1867 i = 0; 1868 for (k=0; k<ncols; k++) { 1869 tcol = cmap[cols[k]+cstart]; 1870 if (tcol) { 1871 subcols[i] = --tcol; 1872 subvals[i++] = vals[k]; 1873 } 1874 } 1875 1876 /* off-diagonal part B = c->B */ 1877 ncols = bi[Crow+1] - bi[Crow]; 1878 cols = bj + bi[Crow]; 1879 vals = b->a + bi[Crow]; 1880 for (k=0; k<ncols; k++) { 1881 tcol = cmap[bmap[cols[k]]]; 1882 if (tcol) { 1883 subcols[i] = --tcol; 1884 subvals[i++] = vals[k]; 1885 } 1886 } 1887 #endif 1888 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1889 } 1890 } 1891 } 1892 1893 /* Now assemble the off-proc rows */ 1894 for (i=0; i<nrqs; i++) { /* for each requested message */ 1895 /* recv values from other processes */ 1896 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1897 proc = pa[idex]; 1898 sbuf1_i = sbuf1[proc]; 1899 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1900 ct1 = 2 + 1; 1901 ct2 = 0; /* count of received C->j */ 1902 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1903 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1904 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1905 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1906 1907 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1908 max1 = sbuf1_i[2]; /* num of rows */ 1909 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1910 row = sbuf1_i[ct1]; /* row index of submat */ 1911 if (!allcolumns) { 1912 idex = 0; 1913 if (scall == MAT_INITIAL_MATRIX) { 1914 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1915 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1916 #if defined(PETSC_USE_CTABLE) 1917 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1918 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1919 } else { 1920 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1921 } 1922 #else 1923 tcol = cmap[rbuf3_i[ct2]]; 1924 #endif 1925 if (tcol) { 1926 subcols[idex] = --tcol; 1927 subvals[idex++] = rbuf4_i[ct2]; 1928 1929 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 
1930 For reuse, we replace the received C->j with the index that should be inserted into submat */
1931 rbuf3_i[ct3++] = ct2;
1932 }
1933 }
1934 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr);
1935
1936 } else { /* scall == MAT_REUSE_MATRIX */
1937 submat = submats[0];
1938 subc = (Mat_SeqAIJ*)submat->data;
1939
1940 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1941 for (l=0; l<nnz; l++) {
1942 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1943 subvals[idex++] = rbuf4_i[ct2];
1944 }
1945
1946 bj = subc->j + subc->i[row];
1947 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr);
1948 }
1949 } else { /* allcolumns */
1950 nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1951 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr);
1952 ct2 += nnz;
1953 }
1954 }
1955 }
1956
1957 /* sends of a->a are done */
1958 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1959 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr);
1960
1961 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1962 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1963 submats[0] = submat;
1964
1965 /* Restore the indices */
1966 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
1967 if (!allcolumns) {
1968 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
1969 }
1970
1971 /* Destroy allocated memory */
1972 for (i=0; i<nrqs; ++i) {
1973 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); /* rbuf4[i] came from PetscMalloc1(), so free each buffer individually */
1974 }
1975 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
1976 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1977 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1978
1979 if (scall == MAT_INITIAL_MATRIX) {
1980 ierr = PetscFree(lens);CHKERRQ(ierr);
1981 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1982 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1983 }
1984 PetscFunctionReturn(0);
1985 }
1986
1987 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1988 {
1989 PetscErrorCode ierr;
1990 PetscInt ncol;
1991 PetscBool colflag,allcolumns=PETSC_FALSE;
1992
1993 PetscFunctionBegin;
1994 /* Allocate memory to hold all the submatrices */
1995 if (scall == MAT_INITIAL_MATRIX) {
1996 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr);
1997 }
1998
1999 /* Check for special case: each processor gets entire matrix columns */
2000 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
2001 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
2002 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
2003
2004 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
2005 PetscFunctionReturn(0);
2006 }
2007
2008 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
2009 {
2010 PetscErrorCode ierr;
2011 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2];
2012 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE;
2013 Mat_SeqAIJ *subc;
2014 Mat_SubMat *smat;
2015
2016 PetscFunctionBegin;
2017 /* Check for special case: each processor has a single IS */
2018 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
2019 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
2020 PetscFunctionReturn(0);
2021 }
2022
2023 if (scall ==
MAT_REUSE_MATRIX && !ismax) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"n=0 is not supported for MatGetSubMatrices(mat,n,isrow,iscol,MAT_REUSE_MATRIX,...). Set n=1 with zero-length isrow and iscolumn instead"); 2024 2025 /* Collect global wantallmatrix and nstages */ 2026 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2027 if (!nmax) nmax = 1; 2028 2029 if (scall == MAT_INITIAL_MATRIX) { 2030 /* Collect global wantallmatrix and nstages */ 2031 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2032 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2033 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2034 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2035 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2036 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2037 wantallmatrix = PETSC_TRUE; 2038 2039 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2040 } 2041 } 2042 2043 /* Determine the number of stages through which submatrices are done 2044 Each stage will extract nmax submatrices. 2045 nmax is determined by the matrix column dimension. 2046 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2047 */ 2048 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 2049 2050 in[0] = -1*(PetscInt)wantallmatrix; 2051 in[1] = nstages; 2052 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2053 wantallmatrix = (PetscBool)(-out[0]); 2054 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2055 2056 } else { 2057 subc = (Mat_SeqAIJ*)((*submat)[0]->data); 2058 smat = subc->submatis1; 2059 if (!smat) { 2060 /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) 
*/
2061 wantallmatrix = PETSC_TRUE;
2062 } else {
2063 nstages = smat->nstages;
2064 }
2065 }
2066
2067 if (wantallmatrix) {
2068 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
2069 PetscFunctionReturn(0);
2070 }
2071
2072 /* Allocate memory to hold all the submatrices */
2073 if (scall == MAT_INITIAL_MATRIX) {
2074 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
2075 }
2076
2077 for (i=0,pos=0; i<nstages; i++) {
2078 if (pos+nmax <= ismax) max_no = nmax;
2079 else if (pos == ismax) max_no = 0;
2080 else max_no = ismax-pos;
2081 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
2082 pos += max_no;
2083 }
2084
2085 if (scall == MAT_INITIAL_MATRIX && ismax) {
2086 /* save nstages for reuse */
2087 subc = (Mat_SeqAIJ*)((*submat)[0]->data);
2088 smat = subc->submatis1;
2089 if (!smat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"smat does not exist");
2090 smat->nstages = nstages;
2091 }
2092 PetscFunctionReturn(0);
2093 }
2094
2095 /* -------------------------------------------------------------------------*/
2096 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2097 {
2098 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
2099 Mat A = c->A;
2100 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2101 const PetscInt **icol,**irow;
2102 PetscInt *nrow,*ncol,start;
2103 PetscErrorCode ierr;
2104 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2105 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2106 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2107 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2108 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2109 #if defined(PETSC_USE_CTABLE)
2110 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2111 #else
2112 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2113 #endif
2114 const PetscInt *irow_i;
2115 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2116 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2117 MPI_Request *r_waits4,*s_waits3,*s_waits4;
2118 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
2119 MPI_Status *r_status3,*r_status4,*s_status4;
2120 MPI_Comm comm;
2121 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2122 PetscMPIInt *onodes1,*olengths1,end;
2123 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2124 Mat_SubMat **smats,*smat_i;
2125 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2126 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2127
2128 PetscFunctionBegin;
2129 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
2130 size = c->size;
2131 rank = c->rank;
2132
2133 ierr = PetscMalloc5(ismax,&smats,ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax,&allcolumns);CHKERRQ(ierr);
2134 ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
2135
2136 for (i=0; i<ismax; i++) {
2137 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
2138 if (!issorted[i]) iscsorted = issorted[i];
2139
2140 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
2141
2142 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
2143 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
2144
2145 /* Check for special case: allcolumn */
2146 ierr =
ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2148 if (colflag && ncol[i] == C->cmap->N) { 2149 allcolumns[i] = PETSC_TRUE; 2150 icol[i] = NULL; 2151 } else { 2152 allcolumns[i] = PETSC_FALSE; 2153 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2154 } 2155 } 2156 2157 if (scall == MAT_REUSE_MATRIX) { 2158 /* Assumes new rows are same length as the old rows */ 2159 for (i=0; i<ismax; i++) { 2160 subc = (Mat_SeqAIJ*)(submats[i]->data); 2161 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2162 2163 /* Initial matrix as if empty */ 2164 ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2165 2166 /* Initial matrix as if empty */ 2167 submats[i]->factortype = C->factortype; 2168 2169 smat_i = subc->submatis1; 2170 smats[i] = smat_i; 2171 2172 nrqs = smat_i->nrqs; 2173 nrqr = smat_i->nrqr; 2174 rbuf1 = smat_i->rbuf1; 2175 rbuf2 = smat_i->rbuf2; 2176 rbuf3 = smat_i->rbuf3; 2177 req_source2 = smat_i->req_source2; 2178 2179 sbuf1 = smat_i->sbuf1; 2180 sbuf2 = smat_i->sbuf2; 2181 ptr = smat_i->ptr; 2182 tmp = smat_i->tmp; 2183 ctr = smat_i->ctr; 2184 2185 pa = smat_i->pa; 2186 req_size = smat_i->req_size; 2187 req_source1 = smat_i->req_source1; 2188 2189 allcolumns[i] = smat_i->allcolumns; 2190 row2proc[i] = smat_i->row2proc; 2191 rmap[i] = smat_i->rmap; 2192 cmap[i] = smat_i->cmap; 2193 } 2194 } else { /* scall == MAT_INITIAL_MATRIX */ 2195 /* Get some new tags to keep the communication clean */ 2196 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2197 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2198 2199 /* evaluate communication - mesg to who, length of mesg, and buffer space 2200 required. 
Based on this, buffers are allocated, and data copied into them*/ 2201 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2202 2203 for (i=0; i<ismax; i++) { 2204 jmax = nrow[i]; 2205 irow_i = irow[i]; 2206 2207 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2208 row2proc[i] = row2proc_i; 2209 2210 if (issorted[i]) proc = 0; 2211 for (j=0; j<jmax; j++) { 2212 if (!issorted[i]) proc = 0; 2213 row = irow_i[j]; 2214 while (row >= C->rmap->range[proc+1]) proc++; 2215 w4[proc]++; 2216 row2proc_i[j] = proc; /* map row index to proc */ 2217 } 2218 for (j=0; j<size; j++) { 2219 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2220 } 2221 } 2222 2223 nrqs = 0; /* no of outgoing messages */ 2224 msz = 0; /* total mesg length (for all procs) */ 2225 w1[rank] = 0; /* no mesg sent to self */ 2226 w3[rank] = 0; 2227 for (i=0; i<size; i++) { 2228 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2229 } 2230 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2231 for (i=0,j=0; i<size; i++) { 2232 if (w1[i]) { pa[j] = i; j++; } 2233 } 2234 2235 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2236 for (i=0; i<nrqs; i++) { 2237 j = pa[i]; 2238 w1[j] += w2[j] + 2* w3[j]; 2239 msz += w1[j]; 2240 } 2241 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2242 2243 /* Determine the number of messages to expect, their lengths, from from-ids */ 2244 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2245 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2246 2247 /* Now post the Irecvs corresponding to these messages */ 2248 tag0 = ((PetscObject)C)->tag; 2249 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2250 2251 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2252 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2253 2254 /* Allocate Memory for outgoing messages */ 2255 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2256 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2257 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2258 2259 { 2260 PetscInt *iptr = tmp; 2261 k = 0; 2262 for (i=0; i<nrqs; i++) { 2263 j = pa[i]; 2264 iptr += k; 2265 sbuf1[j] = iptr; 2266 k = w1[j]; 2267 } 2268 } 2269 2270 /* Form the outgoing messages. 
Initialize the header space */ 2271 for (i=0; i<nrqs; i++) { 2272 j = pa[i]; 2273 sbuf1[j][0] = 0; 2274 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2275 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2276 } 2277 2278 /* Parse the isrow and copy data into outbuf */ 2279 for (i=0; i<ismax; i++) { 2280 row2proc_i = row2proc[i]; 2281 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2282 irow_i = irow[i]; 2283 jmax = nrow[i]; 2284 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2285 proc = row2proc_i[j]; 2286 if (proc != rank) { /* copy to the outgoing buf*/ 2287 ctr[proc]++; 2288 *ptr[proc] = irow_i[j]; 2289 ptr[proc]++; 2290 } 2291 } 2292 /* Update the headers for the current IS */ 2293 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2294 if ((ctr_j = ctr[j])) { 2295 sbuf1_j = sbuf1[j]; 2296 k = ++sbuf1_j[0]; 2297 sbuf1_j[2*k] = ctr_j; 2298 sbuf1_j[2*k-1] = i; 2299 } 2300 } 2301 } 2302 2303 /* Now post the sends */ 2304 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2305 for (i=0; i<nrqs; ++i) { 2306 j = pa[i]; 2307 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2308 } 2309 2310 /* Post Receives to capture the buffer size */ 2311 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2312 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2313 rbuf2[0] = tmp + msz; 2314 for (i=1; i<nrqs; ++i) { 2315 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2316 } 2317 for (i=0; i<nrqs; ++i) { 2318 j = pa[i]; 2319 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 2320 } 2321 2322 /* Send to other procs the buf size they should allocate */ 2323 /* Receive messages*/ 2324 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2325 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2326 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2327 { 2328 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2329 PetscInt *sbuf2_i; 2330 2331 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2332 for (i=0; i<nrqr; ++i) { 2333 req_size[i] = 0; 2334 rbuf1_i = rbuf1[i]; 2335 start = 2*rbuf1_i[0] + 1; 2336 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2337 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2338 sbuf2_i = sbuf2[i]; 2339 for (j=start; j<end; j++) { 2340 id = rbuf1_i[j] - rstart; 2341 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2342 sbuf2_i[j] = ncols; 2343 req_size[i] += ncols; 2344 } 2345 req_source1[i] = r_status1[i].MPI_SOURCE; 2346 /* form the header */ 2347 sbuf2_i[0] = req_size[i]; 2348 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2349 2350 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 2351 } 2352 } 2353 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2354 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2355 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2356 2357 /* Receive messages*/ 2358 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2359 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2360 2361 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2362 for (i=0; i<nrqs; ++i) { 2363 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2364 req_source2[i] = r_status2[i].MPI_SOURCE; 2365 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 2366 } 2367 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2368 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2369 2370 /* Wait on sends1 and sends2 */ 2371 
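/* Note: completing sends1 (the request headers in sbuf1) and sends2 (the per-row size replies in sbuf2)
   guarantees MPI no longer references those buffers, so the request and status arrays can be freed below
   while sbuf1 and sbuf2 themselves are kept and later stashed in Mat_SubMat for MAT_REUSE_MATRIX. */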
ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2372 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2373 2374 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2375 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2376 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2377 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2378 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2379 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2380 2381 /* Now allocate sending buffers for a->j, and send them off */ 2382 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2383 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2384 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2385 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2386 2387 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2388 { 2389 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2390 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2391 PetscInt cend = C->cmap->rend; 2392 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2393 2394 for (i=0; i<nrqr; i++) { 2395 rbuf1_i = rbuf1[i]; 2396 sbuf_aj_i = sbuf_aj[i]; 2397 ct1 = 2*rbuf1_i[0] + 1; 2398 ct2 = 0; 2399 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2400 kmax = rbuf1[i][2*j]; 2401 for (k=0; k<kmax; k++,ct1++) { 2402 row = rbuf1_i[ct1] - rstart; 2403 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2404 ncols = nzA + nzB; 2405 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2406 2407 /* load the column indices for this row into cols */ 2408 cols = sbuf_aj_i + ct2; 2409 2410 lwrite = 0; 2411 for (l=0; l<nzB; l++) { 2412 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2413 } 2414 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2415 for (l=0; l<nzB; l++) { 2416 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2417 } 2418 2419 ct2 += ncols; 2420 } 2421 } 2422 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 2423 } 2424 } 2425 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2426 2427 /* create col map: global col of C -> local col of submatrices */ 2428 { 2429 const PetscInt *icol_i; 2430 #if defined(PETSC_USE_CTABLE) 2431 for (i=0; i<ismax; i++) { 2432 if (!allcolumns[i]) { 2433 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2434 2435 jmax = ncol[i]; 2436 icol_i = icol[i]; 2437 cmap_i = cmap[i]; 2438 for (j=0; j<jmax; j++) { 2439 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2440 } 2441 } else cmap[i] = NULL; 2442 } 2443 #else 2444 for (i=0; i<ismax; i++) { 2445 if (!allcolumns[i]) { 2446 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2447 jmax = ncol[i]; 2448 icol_i = icol[i]; 2449 cmap_i = cmap[i]; 2450 for (j=0; j<jmax; j++) { 2451 cmap_i[icol_i[j]] = j+1; 2452 } 2453 } else cmap[i] = NULL; 2454 } 2455 #endif 2456 } 2457 2458 /* Create lens which is required for MatCreate... 
*/ 2459 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2460 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2461 2462 if (ismax) { 2463 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2464 } 2465 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2466 2467 /* Update lens from local data */ 2468 for (i=0; i<ismax; i++) { 2469 row2proc_i = row2proc[i]; 2470 jmax = nrow[i]; 2471 if (!allcolumns[i]) cmap_i = cmap[i]; 2472 irow_i = irow[i]; 2473 lens_i = lens[i]; 2474 for (j=0; j<jmax; j++) { 2475 row = irow_i[j]; 2476 proc = row2proc_i[j]; 2477 if (proc == rank) { 2478 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2479 if (!allcolumns[i]) { 2480 for (k=0; k<ncols; k++) { 2481 #if defined(PETSC_USE_CTABLE) 2482 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2483 #else 2484 tcol = cmap_i[cols[k]]; 2485 #endif 2486 if (tcol) lens_i[j]++; 2487 } 2488 } else { /* allcolumns */ 2489 lens_i[j] = ncols; 2490 } 2491 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2492 } 2493 } 2494 } 2495 2496 /* Create row map: global row of C -> local row of submatrices */ 2497 #if defined(PETSC_USE_CTABLE) 2498 for (i=0; i<ismax; i++) { 2499 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2500 irow_i = irow[i]; 2501 jmax = nrow[i]; 2502 for (j=0; j<jmax; j++) { 2503 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2504 } 2505 } 2506 #else 2507 for (i=0; i<ismax; i++) { 2508 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2509 rmap_i = rmap[i]; 2510 irow_i = irow[i]; 2511 jmax = nrow[i]; 2512 for (j=0; j<jmax; j++) { 2513 rmap_i[irow_i[j]] = j; 2514 } 2515 } 2516 #endif 2517 2518 /* Update lens from offproc data */ 2519 { 2520 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2521 2522 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2523 for (tmp2=0; tmp2<nrqs; tmp2++) { 2524 sbuf1_i = sbuf1[pa[tmp2]]; 2525 jmax = sbuf1_i[0]; 2526 ct1 = 2*jmax+1; 2527 ct2 = 0; 2528 rbuf2_i = rbuf2[tmp2]; 2529 rbuf3_i = rbuf3[tmp2]; 2530 for (j=1; j<=jmax; j++) { 2531 is_no = sbuf1_i[2*j-1]; 2532 max1 = sbuf1_i[2*j]; 2533 lens_i = lens[is_no]; 2534 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2535 rmap_i = rmap[is_no]; 2536 for (k=0; k<max1; k++,ct1++) { 2537 #if defined(PETSC_USE_CTABLE) 2538 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2539 row--; 2540 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2541 #else 2542 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2543 #endif 2544 max2 = rbuf2_i[ct1]; 2545 for (l=0; l<max2; l++,ct2++) { 2546 if (!allcolumns[is_no]) { 2547 #if defined(PETSC_USE_CTABLE) 2548 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2549 #else 2550 tcol = cmap_i[rbuf3_i[ct2]]; 2551 #endif 2552 if (tcol) lens_i[row]++; 2553 } else { /* allcolumns */ 2554 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 2555 } 2556 } 2557 } 2558 } 2559 } 2560 } 2561 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2562 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2563 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2564 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2565 2566 /* Create the submatrices */ 2567 for (i=0; i<ismax; i++) { 2568 PetscInt rbs,cbs; 2569 2570 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2571 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2572 2573 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2574 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2575 2576 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2577 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2578 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2579 2580 /* create struct Mat_SubMat and attached it to submat */ 2581 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2582 subc = (Mat_SeqAIJ*)submats[i]->data; 2583 subc->submatis1 = smat_i; 2584 smats[i] = smat_i; 2585 2586 smat_i->destroy = submats[i]->ops->destroy; 2587 submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 2588 submats[i]->factortype = C->factortype; 2589 2590 smat_i->id = i; 2591 smat_i->nrqs = nrqs; 2592 smat_i->nrqr = nrqr; 2593 smat_i->rbuf1 = rbuf1; 2594 smat_i->rbuf2 = rbuf2; 2595 smat_i->rbuf3 = rbuf3; 2596 smat_i->sbuf2 = sbuf2; 2597 smat_i->req_source2 = req_source2; 2598 2599 smat_i->sbuf1 = sbuf1; 2600 smat_i->ptr = ptr; 2601 smat_i->tmp = tmp; 2602 smat_i->ctr = ctr; 2603 2604 smat_i->pa = pa; 2605 smat_i->req_size = req_size; 2606 smat_i->req_source1 = req_source1; 2607 2608 smat_i->allcolumns = allcolumns[i]; 2609 smat_i->row2proc = row2proc[i]; 2610 smat_i->rmap = rmap[i]; 2611 smat_i->cmap = cmap[i]; 2612 } 2613 2614 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2615 ierr = PetscFree(lens);CHKERRQ(ierr); 2616 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2617 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2618 2619 } /* endof scall == MAT_INITIAL_MATRIX */ 2620 2621 /* Post recv matrix values */ 2622 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2623 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2624 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2625 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2626 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2627 for (i=0; i<nrqs; ++i) { 2628 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2629 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 2630 } 2631 2632 /* Allocate sending buffers for a->a, and send them off */ 2633 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2634 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2635 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2636 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2637 2638 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2639 { 2640 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2641 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2642 PetscInt cend = C->cmap->rend; 2643 PetscInt *b_j = b->j; 2644 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2645 2646 for (i=0; i<nrqr; i++) { 2647 rbuf1_i = rbuf1[i]; 2648 sbuf_aa_i = sbuf_aa[i]; 2649 ct1 = 2*rbuf1_i[0]+1; 2650 ct2 = 0; 2651 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2652 kmax = rbuf1_i[2*j]; 2653 for (k=0; k<kmax; k++,ct1++) { 2654 row = rbuf1_i[ct1] - 
rstart; 2655 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2656 ncols = nzA + nzB; 2657 cworkB = b_j + b_i[row]; 2658 vworkA = a_a + a_i[row]; 2659 vworkB = b_a + b_i[row]; 2660 2661 /* load the column values for this row into vals*/ 2662 vals = sbuf_aa_i+ct2; 2663 2664 lwrite = 0; 2665 for (l=0; l<nzB; l++) { 2666 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2667 } 2668 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2669 for (l=0; l<nzB; l++) { 2670 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2671 } 2672 2673 ct2 += ncols; 2674 } 2675 } 2676 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 2677 } 2678 } 2679 2680 if (!ismax) { 2681 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2682 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2683 } 2684 2685 /* Assemble the matrices */ 2686 /* First assemble the local rows */ 2687 for (i=0; i<ismax; i++) { 2688 row2proc_i = row2proc[i]; 2689 subc = (Mat_SeqAIJ*)submats[i]->data; 2690 imat_ilen = subc->ilen; 2691 imat_j = subc->j; 2692 imat_i = subc->i; 2693 imat_a = subc->a; 2694 2695 if (!allcolumns[i]) cmap_i = cmap[i]; 2696 rmap_i = rmap[i]; 2697 irow_i = irow[i]; 2698 jmax = nrow[i]; 2699 for (j=0; j<jmax; j++) { 2700 row = irow_i[j]; 2701 proc = row2proc_i[j]; 2702 if (proc == rank) { 2703 old_row = row; 2704 #if defined(PETSC_USE_CTABLE) 2705 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2706 row--; 2707 #else 2708 row = rmap_i[row]; 2709 #endif 2710 ilen_row = imat_ilen[row]; 2711 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2712 mat_i = imat_i[row]; 2713 mat_a = imat_a + mat_i; 2714 mat_j = imat_j + mat_i; 2715 if (!allcolumns[i]) { 2716 for (k=0; k<ncols; k++) { 2717 #if defined(PETSC_USE_CTABLE) 2718 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2719 #else 2720 tcol = cmap_i[cols[k]]; 2721 #endif 2722 if (tcol) { 2723 *mat_j++ = tcol - 1; 2724 *mat_a++ = vals[k]; 2725 ilen_row++; 2726 } 2727 } 2728 } else { /* allcolumns */ 2729 for (k=0; k<ncols; k++) { 2730 *mat_j++ = cols[k]; /* global col index! 
*/ 2731 *mat_a++ = vals[k]; 2732 ilen_row++; 2733 } 2734 } 2735 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2736 2737 imat_ilen[row] = ilen_row; 2738 } 2739 } 2740 } 2741 2742 /* Now assemble the off proc rows */ 2743 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2744 for (tmp2=0; tmp2<nrqs; tmp2++) { 2745 sbuf1_i = sbuf1[pa[tmp2]]; 2746 jmax = sbuf1_i[0]; 2747 ct1 = 2*jmax + 1; 2748 ct2 = 0; 2749 rbuf2_i = rbuf2[tmp2]; 2750 rbuf3_i = rbuf3[tmp2]; 2751 rbuf4_i = rbuf4[tmp2]; 2752 for (j=1; j<=jmax; j++) { 2753 is_no = sbuf1_i[2*j-1]; 2754 rmap_i = rmap[is_no]; 2755 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2756 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2757 imat_ilen = subc->ilen; 2758 imat_j = subc->j; 2759 imat_i = subc->i; 2760 imat_a = subc->a; 2761 max1 = sbuf1_i[2*j]; 2762 for (k=0; k<max1; k++,ct1++) { 2763 row = sbuf1_i[ct1]; 2764 #if defined(PETSC_USE_CTABLE) 2765 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2766 row--; 2767 #else 2768 row = rmap_i[row]; 2769 #endif 2770 ilen = imat_ilen[row]; 2771 mat_i = imat_i[row]; 2772 mat_a = imat_a + mat_i; 2773 mat_j = imat_j + mat_i; 2774 max2 = rbuf2_i[ct1]; 2775 if (!allcolumns[is_no]) { 2776 for (l=0; l<max2; l++,ct2++) { 2777 #if defined(PETSC_USE_CTABLE) 2778 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2779 #else 2780 tcol = cmap_i[rbuf3_i[ct2]]; 2781 #endif 2782 if (tcol) { 2783 *mat_j++ = tcol - 1; 2784 *mat_a++ = rbuf4_i[ct2]; 2785 ilen++; 2786 } 2787 } 2788 } else { /* allcolumns */ 2789 for (l=0; l<max2; l++,ct2++) { 2790 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2791 *mat_a++ = rbuf4_i[ct2]; 2792 ilen++; 2793 } 2794 } 2795 imat_ilen[row] = ilen; 2796 } 2797 } 2798 } 2799 2800 if (!iscsorted) { /* sort column indices of the rows */ 2801 for (i=0; i<ismax; i++) { 2802 subc = (Mat_SeqAIJ*)submats[i]->data; 2803 imat_j = subc->j; 2804 imat_i = subc->i; 2805 imat_a = subc->a; 2806 imat_ilen = subc->ilen; 2807 2808 if (allcolumns[i]) continue; 2809 jmax = nrow[i]; 2810 for (j=0; j<jmax; j++) { 2811 PetscInt ilen; 2812 2813 mat_i = imat_i[j]; 2814 mat_a = imat_a + mat_i; 2815 mat_j = imat_j + mat_i; 2816 ilen = imat_ilen[j]; 2817 ierr = PetscSortIntWithScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2818 } 2819 } 2820 } 2821 2822 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2823 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2824 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2825 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2826 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2827 2828 /* Restore the indices */ 2829 for (i=0; i<ismax; i++) { 2830 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2831 if (!allcolumns[i]) { 2832 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2833 } 2834 } 2835 2836 for (i=0; i<ismax; i++) { 2837 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2838 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2839 } 2840 2841 /* Destroy allocated memory */ 2842 if (!ismax) { 2843 ierr = PetscFree(pa);CHKERRQ(ierr); 2844 2845 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2846 for (i=0; i<nrqr; ++i) { 2847 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2848 } 2849 for (i=0; i<nrqs; ++i) { 2850 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2851 } 2852 2853 ierr = PetscFree3(sbuf2,req_size,req_source1);CHKERRQ(ierr); 2854 ierr = PetscFree3(req_source2,rbuf2,rbuf3);CHKERRQ(ierr); 2855 } 2856 2857 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2858 ierr = 
PetscFree(sbuf_aa);CHKERRQ(ierr);
2859 ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);
2860
2861 for (i=0; i<nrqs; ++i) {
2862 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
2863 }
2864 ierr = PetscFree(rbuf4);CHKERRQ(ierr);
2865
2866 ierr = PetscFree5(smats,row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
2867 PetscFunctionReturn(0);
2868 }
2869
2870 /*
2871 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2872 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
2873 of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
2874 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2875 After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2876 state, and needs to be "assembled" later by compressing B's column space.
2877
2878 This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2879 Following this call, C->A & C->B have been created, even if empty.
2880 */
2881 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2882 {
2883 /* If making this function public, change the error returned in this function away from _PLIB. */
2884 PetscErrorCode ierr;
2885 Mat_MPIAIJ *aij;
2886 Mat_SeqAIJ *Baij;
2887 PetscBool seqaij,Bdisassembled;
2888 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2889 PetscScalar v;
2890 const PetscInt *rowindices,*colindices;
2891
2892 PetscFunctionBegin;
2893 /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2894 if (A) {
2895 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2896 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2897 if (rowemb) {
2898 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2899 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2900 } else {
2901 if (C->rmap->n != A->rmap->n) {
2902 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2903 }
2904 }
2905 if (dcolemb) {
2906 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr);
2907 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2908 } else {
2909 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2910 }
2911 }
2912 if (B) {
2913 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr);
2914 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2915 if (rowemb) {
2916 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr);
2917 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2918 } else {
2919 if (C->rmap->n != B->rmap->n) {
2920 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2921 }
2922 }
2923 if (ocolemb) {
2924 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr);
2925 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2926 } else {
2927 if (C->cmap->N - C->cmap->n != B->cmap->n)
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2928 } 2929 } 2930 2931 aij = (Mat_MPIAIJ*)(C->data); 2932 if (!aij->A) { 2933 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2934 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2935 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2936 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2937 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2938 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2939 } 2940 if (A) { 2941 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2942 } else { 2943 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2944 } 2945 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2946 /* 2947 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2948 need to "disassemble" B -- convert it to using C's global indices. 2949 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2950 2951 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2952 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2953 2954 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2955 At least avoid calling MatSetValues() and the implied searches? 2956 */ 2957 2958 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2959 #if defined(PETSC_USE_CTABLE) 2960 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2961 #else 2962 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2963 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2964 if (aij->B) { 2965 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2966 } 2967 #endif 2968 ngcol = 0; 2969 if (aij->lvec) { 2970 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2971 } 2972 if (aij->garray) { 2973 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2974 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2975 } 2976 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2977 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2978 } 2979 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2980 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2981 } 2982 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2983 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2984 } 2985 } 2986 Bdisassembled = PETSC_FALSE; 2987 if (!aij->B) { 2988 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2989 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2990 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2991 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2992 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2993 Bdisassembled = PETSC_TRUE; 2994 } 2995 if (B) { 2996 Baij = (Mat_SeqAIJ*)(B->data); 2997 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2998 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2999 for (i=0; i<B->rmap->n; i++) { 3000 nz[i] = Baij->i[i+1] - Baij->i[i]; 3001 } 3002 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3003 ierr = PetscFree(nz);CHKERRQ(ierr); 3004 } 3005 3006 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3007 shift = rend-rstart; 3008 count = 0; 3009 rowindices = NULL; 3010 colindices = NULL; 3011 if (rowemb) { 3012 ierr = 
ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3013 } 3014 if (ocolemb) { 3015 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3016 } 3017 for (i=0; i<B->rmap->n; i++) { 3018 PetscInt row; 3019 row = i; 3020 if (rowindices) row = rowindices[i]; 3021 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3022 col = Baij->j[count]; 3023 if (colindices) col = colindices[col]; 3024 if (Bdisassembled && col>=rstart) col += shift; 3025 v = Baij->a[count]; 3026 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3027 ++count; 3028 } 3029 } 3030 /* No assembly for aij->B is necessary. */ 3031 /* FIXME: set aij->B's nonzerostate correctly. */ 3032 } else { 3033 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3034 } 3035 C->preallocated = PETSC_TRUE; 3036 C->was_assembled = PETSC_FALSE; 3037 C->assembled = PETSC_FALSE; 3038 /* 3039 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3040 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3041 */ 3042 PetscFunctionReturn(0); 3043 } 3044 3045 /* 3046 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3047 */ 3048 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3049 { 3050 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3051 3052 PetscFunctionBegin; 3053 PetscValidPointer(A,2); 3054 PetscValidPointer(B,3); 3055 /* FIXME: make sure C is assembled */ 3056 *A = aij->A; 3057 *B = aij->B; 3058 /* Note that we don't incref *A and *B, so be careful! */ 3059 PetscFunctionReturn(0); 3060 } 3061 3062 /* 3063 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3064 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3065 */ 3066 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3067 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3068 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3069 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3070 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3071 { 3072 PetscErrorCode ierr; 3073 PetscMPIInt isize,flag; 3074 PetscInt i,ii,cismax,ispar; 3075 Mat *A,*B; 3076 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3077 3078 PetscFunctionBegin; 3079 if (!ismax) PetscFunctionReturn(0); 3080 3081 for (i = 0, cismax = 0; i < ismax; ++i) { 3082 PetscMPIInt isize; 3083 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3084 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3085 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3086 if (isize > 1) ++cismax; 3087 } 3088 3089 /* 3090 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3091 ispar counts the number of parallel ISs across C's comm. 3092 */ 3093 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3094 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3095 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3096 PetscFunctionReturn(0); 3097 } 3098 3099 /* if (ispar) */ 3100 /* 3101 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 
3102 These are used to extract the off-diag portion of the resulting parallel matrix. 3103 The row IS for the off-diag portion is the same as for the diag portion, 3104 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3105 */ 3106 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3107 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3108 for (i = 0, ii = 0; i < ismax; ++i) { 3109 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3110 if (isize > 1) { 3111 /* 3112 TODO: This is the part that's ***NOT SCALABLE***. 3113 To fix this we need to extract just the indices of C's nonzero columns 3114 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3115 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3116 to be done without serializing on the IS list, so, most likely, it is best 3117 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3118 */ 3119 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3120 /* Now we have to 3121 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3122 were sorted on each rank, concatenated they might no longer be sorted; 3123 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3124 indices in the nondecreasing order to the original index positions. 3125 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3126 */ 3127 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3128 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3129 ++ii; 3130 } 3131 } 3132 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3133 for (i = 0, ii = 0; i < ismax; ++i) { 3134 PetscInt j,issize; 3135 const PetscInt *indices; 3136 3137 /* 3138 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3139 */ 3140 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3141 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3142 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3143 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3144 for (j = 1; j < issize; ++j) { 3145 if (indices[j] == indices[j-1]) { 3146 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3147 } 3148 } 3149 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3150 3151 3152 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3153 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3154 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3155 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3156 for (j = 1; j < issize; ++j) { 3157 if (indices[j-1] == indices[j]) { 3158 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3159 } 3160 } 3161 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3162 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3163 if (isize > 1) { 3164 cisrow[ii] = isrow[i]; 3165 ++ii; 3166 } 3167 } 3168 /* 3169 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3170 array of sequential matrices underlying the resulting parallel matrices. 3171 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3172 contain duplicates. 3173 3174 There are as many diag matrices as there are original index sets. 
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
       values into A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall == MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if isize == 1).
     Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
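    /*
       A[i] and B[ii] are the sequential diag and off-diag pieces just extracted by getsubmats_seq();
       B[ii] exists only when isrow[i]/iscol[i] are parallel, in which case ii tracks the position of
       this IS among the parallel ones.
    */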
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = A[i];
        BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }

        ierr = MatDestroy(&AA);CHKERRQ(ierr);
        ierr = MatDestroy(&BB);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
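
/*
   Minimal usage sketch (hypothetical caller code, not compiled as part of this file).
   This routine provides the MPI submatrix extraction for MATMPIAIJ and is normally reached
   through the generic MatGetSubMatricesMPI() interface rather than called directly.  It assumes
   isrow/iscol are (possibly parallel) ISs living on subcommunicators of C's communicator; for a
   parallel IS the corresponding submatrix comes back as an MPIAIJ matrix on that subcommunicator:

     Mat *submat;
     ierr = MatGetSubMatricesMPI_MPIAIJ(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
     ... work with submat[0] ...
     ierr = MatDestroyMatrices(1,&submat);CHKERRQ(ierr);
*/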