/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Grow each index set in is[] by ov levels of
   overlap, applying the one-level routine MatIncreaseOverlap_MPIAIJ_Once
   ov times.

   Input:
     C    - the MPIAIJ matrix
     imax - number of index sets
     is[] - the index sets (replaced in place with the enlarged sets)
     ov   - number of overlap levels requested (must be >= 0)
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as
   MatIncreaseOverlap_MPIAIJ, but each overlap level is computed with the
   memory-scalable implementation (PetscSF-based, no O(global size) buffers).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Add one level of overlap to the
   nidx index sets in is[] using two-sided communication plus a PetscSF to
   exchange only the rows actually needed (memory-scalable path).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr     = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr     = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr   = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr   = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge the overlap data
   received from other ranks (recvdata, a stream of <is id, count, rows...>
   triples of total length nrecvs) into the index sets in is[], then sort
   and deduplicate each set; the old IS objects are destroyed and replaced.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of max_lsize+nrecvs entries: existing indices plus
     at worst all received rows could land in one set */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For rows requested by other
   ranks (fromrows, grouped per sending rank by fromsizes), gather the
   column indices of those locally owned rows (from both the diagonal Amat
   and off-diagonal Bmat parts) and pack them into reply buffers.

   Output (allocated here, ownership transferred to the caller):
     sbrowsizes - per-rank <size, offset> pairs of the reply
     sbrows     - packed reply: triples <is id, row count, rows...>
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - For each IS, replace it with
   the union of the column indices of its locally owned rows (diagonal Amat
   part, shifted by cstart, plus off-diagonal Bmat part mapped through
   gcols), sorted with duplicates removed.  Rows owned by other ranks are
   skipped here; they are handled via communication by the caller.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements
corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;  /* table stores (index+1, position+1) pairs */
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

  Message layout (as read by the loops below): rbuf[i][0] is the number of
index sets in the message; for set j (1-based), rbuf[i][2*j-1] is an id that
is copied back unchanged into the reply and rbuf[i][2*j] is the number of
requested rows; the row indices follow the header of length 2*rbuf[i][0]+1.
The replies in xdata use the same layout.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count the requested rows (ct) and row indices (total_sz) over all messages */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial size guess for the single reply slab xdata[0]: roughly the average
     number of nonzeros per local row times the number of rows requested */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0; /* running count of slab entries written (headers + data) */
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1; /* read cursor: first row index past the header */
    ct2    = ct1;        /* write cursor into xdata[i] */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); /* fresh dedup bitmap per IS */
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the slab by 1.5x; every xdata[ctr] aliases into it, so rebase them all */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append, for every row gathered above, its global column indices
         from both blocks: one level of overlap expansion */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global column numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: garray maps to global columns */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one in the slab */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix: a sequential copy B of the
    parallel matrix A is built on PETSC_COMM_SELF.  With
    scall == MAT_INITIAL_MATRIX the structure is created from scratch and
    returned in *Bin; otherwise the previously created **Bin is reused.
    Numerical values are gathered only when flag == MAT_GET_VALUES.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall ==
MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
       (lens is indexed by GLOBAL row; only the locally owned entries are
       filled here, the Allgatherv below completes the array)
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
      Copy my part of matrix column indices over, writing directly into the
      slots of b->j that belong to this process's rows (jsendbuf aliases b->j)
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
      Gather all column indices to all processors
      (recvcounts/displs are recomputed in units of nonzeros, not rows)
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
      Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices (prefix sums of the row lengths) */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
    Copy my part of matrix numerical values into the values location
    (same lower/diagonal/upper interleaving as the column-index pass above)
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
       (in the MAT_REUSE_MATRIX path recvcounts/displs were not allocated above)
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr =
MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate A's symmetry flags to the sequential copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatDestroy_MPIAIJ_MatGetSubmatrices - destroy routine installed on the
   sequential submatrices produced by MatGetSubMatrices_MPIAIJ_SingleIS_Local.
   It first chains to the original destroy saved in submatj->destroy, then
   releases the communication bookkeeping (Mat_SubMat) that was kept attached
   to the submatrix for MAT_REUSE_MATRIX calls.  Each free below must mirror
   the PetscMalloc4/PetscMalloc3/PetscMalloc1 call that created the array.
*/
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
  Mat_SubMat     *submatj = c->submatis1;
  PetscInt       i;

  PetscFunctionBegin;
  ierr = submatj->destroy(C);CHKERRQ(ierr);

  ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);

  /* sbuf2[i] were allocated individually, one per received request */
  for (i=0; i<submatj->nrqr; ++i) {
    ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);

  ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);

  /* rbuf3[i] were allocated individually, one per outgoing request */
  for (i=0; i<submatj->nrqs; ++i) {
    ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(submatj->pa);CHKERRQ(ierr);

  ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
  if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
  ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
#else
  ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
#endif

  /* cmap exists only when a column subset was requested */
  if (!submatj->allcolumns) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
#else
    ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(submatj);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatGetSubMatrices_MPIAIJ_SingleIS_Local - extract the single submatrix
   given by isrow[0]/iscol[0] from the parallel matrix C, communicating
   the required off-process rows; requires ismax == 1 and a sorted isrow[0].
   When allcolumns is PETSC_TRUE all columns of C are kept and iscol[0] is
   not consulted.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt    *onodes1,*olengths1,idex,end;
  Mat_SubMat     *smatis1;
  PetscBool      isrowsorted;

  PetscFunctionBegin;
  if
(ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1307 1308 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1309 size = c->size; 1310 rank = c->rank; 1311 1312 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1313 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1314 1315 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1316 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1317 if (allcolumns) { 1318 icol = NULL; 1319 ncol = C->cmap->N; 1320 } else { 1321 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1322 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1323 } 1324 1325 if (scall == MAT_INITIAL_MATRIX) { 1326 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1327 1328 /* Get some new tags to keep the communication clean */ 1329 tag1 = ((PetscObject)C)->tag; 1330 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1331 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1332 1333 /* evaluate communication - mesg to who, length of mesg, and buffer space 1334 required. Based on this, buffers are allocated, and data copied into them */ 1335 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1336 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1337 1338 /* w1[proc] = num of rows owned by proc -- to be requested */ 1339 proc = 0; 1340 nrqs = 0; /* num of outgoing messages */ 1341 for (j=0; j<nrow; j++) { 1342 row = irow[j]; /* sorted! 
*/ 1343 while (row >= C->rmap->range[proc+1]) proc++; 1344 w1[proc]++; 1345 row2proc[j] = proc; /* map row index to proc */ 1346 1347 if (proc != rank && !w2[proc]) { 1348 w2[proc] = 1; nrqs++; 1349 } 1350 } 1351 w1[rank] = 0; /* rows owned by self will not be requested */ 1352 1353 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1354 for (proc=0,j=0; proc<size; proc++) { 1355 if (w1[proc]) { pa[j++] = proc;} 1356 } 1357 1358 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1359 msz = 0; /* total mesg length (for all procs) */ 1360 for (i=0; i<nrqs; i++) { 1361 proc = pa[i]; 1362 w1[proc] += 3; 1363 msz += w1[proc]; 1364 } 1365 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1366 1367 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1368 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1369 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1370 1371 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1372 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1373 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1374 1375 /* Now post the Irecvs corresponding to these messages */ 1376 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1377 1378 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1379 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1380 1381 /* Allocate Memory for outgoing messages */ 1382 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1383 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1384 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1385 1386 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1387 iptr = tmp; 1388 for (i=0; i<nrqs; i++) { 1389 proc = pa[i]; 1390 sbuf1[proc] = iptr; 
1391 iptr += w1[proc]; 1392 } 1393 1394 /* Form the outgoing messages */ 1395 /* Initialize the header space */ 1396 for (i=0; i<nrqs; i++) { 1397 proc = pa[i]; 1398 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1399 ptr[proc] = sbuf1[proc] + 3; 1400 } 1401 1402 /* Parse the isrow and copy data into outbuf */ 1403 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1404 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1405 proc = row2proc[j]; 1406 if (proc != rank) { /* copy to the outgoing buf*/ 1407 *ptr[proc] = irow[j]; 1408 ctr[proc]++; ptr[proc]++; 1409 } 1410 } 1411 1412 /* Update the headers for the current IS */ 1413 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1414 if ((ctr_j = ctr[j])) { 1415 sbuf1_j = sbuf1[j]; 1416 k = ++sbuf1_j[0]; 1417 sbuf1_j[2*k] = ctr_j; 1418 sbuf1_j[2*k-1] = 0; 1419 } 1420 } 1421 1422 /* Now post the sends */ 1423 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1424 for (i=0; i<nrqs; ++i) { 1425 proc = pa[i]; 1426 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1427 } 1428 1429 /* Post Receives to capture the buffer size */ 1430 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1431 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1432 1433 rbuf2[0] = tmp + msz; 1434 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1435 1436 for (i=0; i<nrqs; ++i) { 1437 proc = pa[i]; 1438 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1439 } 1440 1441 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1442 1443 /* Send to other procs the buf size they should allocate */ 1444 /* Receive messages*/ 1445 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1446 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1447 1448 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1449 for 
(i=0; i<nrqr; ++i) { 1450 req_size[i] = 0; 1451 rbuf1_i = rbuf1[i]; 1452 start = 2*rbuf1_i[0] + 1; 1453 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1454 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1455 sbuf2_i = sbuf2[i]; 1456 for (j=start; j<end; j++) { 1457 k = rbuf1_i[j] - rstart; 1458 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1459 sbuf2_i[j] = ncols; 1460 req_size[i] += ncols; 1461 } 1462 req_source1[i] = r_status1[i].MPI_SOURCE; 1463 1464 /* form the header */ 1465 sbuf2_i[0] = req_size[i]; 1466 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1467 1468 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1469 } 1470 1471 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1472 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1473 1474 /* rbuf2 is received, Post recv column indices a->j */ 1475 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1476 1477 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1478 for (i=0; i<nrqs; ++i) { 1479 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1480 req_source2[i] = r_status2[i].MPI_SOURCE; 1481 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1482 } 1483 1484 /* Wait on sends1 and sends2 */ 1485 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1486 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1487 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1488 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1489 1490 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1491 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1492 1493 /* Now allocate sending buffers for a->j, and send them off */ 1494 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1495 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1496 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1497 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + 
req_size[i-1]; 1498 1499 for (i=0; i<nrqr; i++) { /* for each requested message */ 1500 rbuf1_i = rbuf1[i]; 1501 sbuf_aj_i = sbuf_aj[i]; 1502 ct1 = 2*rbuf1_i[0] + 1; 1503 ct2 = 0; 1504 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1505 1506 kmax = rbuf1[i][2]; 1507 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1508 row = rbuf1_i[ct1] - rstart; 1509 nzA = ai[row+1] - ai[row]; 1510 nzB = bi[row+1] - bi[row]; 1511 ncols = nzA + nzB; 1512 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1513 1514 /* load the column indices for this row into cols*/ 1515 cols = sbuf_aj_i + ct2; 1516 1517 lwrite = 0; 1518 for (l=0; l<nzB; l++) { 1519 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1520 } 1521 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1522 for (l=0; l<nzB; l++) { 1523 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1524 } 1525 1526 ct2 += ncols; 1527 } 1528 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1529 } 1530 1531 /* create column map (cmap): global col of C -> local col of submat */ 1532 #if defined(PETSC_USE_CTABLE) 1533 if (!allcolumns) { 1534 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1535 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1536 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1537 if (icol[j] >= cstart && icol[j] <cend) { 1538 cmap_loc[icol[j] - cstart] = j+1; 1539 } else { /* use PetscTable for non-local col indices */ 1540 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1541 } 1542 } 1543 } else { 1544 cmap = NULL; 1545 cmap_loc = NULL; 1546 } 1547 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1548 #else 1549 if (!allcolumns) { 1550 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1551 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1552 } else { 1553 cmap = NULL; 1554 } 1555 #endif 1556 1557 /* Create lens for 
MatSeqAIJSetPreallocation() */ 1558 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1559 1560 /* Compute lens from local part of C */ 1561 for (j=0; j<nrow; j++) { 1562 row = irow[j]; 1563 proc = row2proc[j]; 1564 if (proc == rank) { 1565 /* diagonal part A = c->A */ 1566 ncols = ai[row-rstart+1] - ai[row-rstart]; 1567 cols = aj + ai[row-rstart]; 1568 if (!allcolumns) { 1569 for (k=0; k<ncols; k++) { 1570 #if defined(PETSC_USE_CTABLE) 1571 tcol = cmap_loc[cols[k]]; 1572 #else 1573 tcol = cmap[cols[k]+cstart]; 1574 #endif 1575 if (tcol) lens[j]++; 1576 } 1577 } else { /* allcolumns */ 1578 lens[j] = ncols; 1579 } 1580 1581 /* off-diagonal part B = c->B */ 1582 ncols = bi[row-rstart+1] - bi[row-rstart]; 1583 cols = bj + bi[row-rstart]; 1584 if (!allcolumns) { 1585 for (k=0; k<ncols; k++) { 1586 #if defined(PETSC_USE_CTABLE) 1587 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1588 #else 1589 tcol = cmap[bmap[cols[k]]]; 1590 #endif 1591 if (tcol) lens[j]++; 1592 } 1593 } else { /* allcolumns */ 1594 lens[j] += ncols; 1595 } 1596 } 1597 } 1598 1599 /* Create row map (rmap): global row of C -> local row of submat */ 1600 #if defined(PETSC_USE_CTABLE) 1601 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1602 for (j=0; j<nrow; j++) { 1603 row = irow[j]; 1604 proc = row2proc[j]; 1605 if (proc == rank) { /* a local row */ 1606 rmap_loc[row - rstart] = j; 1607 } else { 1608 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1609 } 1610 } 1611 #else 1612 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1613 for (j=0; j<nrow; j++) { 1614 rmap[irow[j]] = j; 1615 } 1616 #endif 1617 1618 /* Update lens from offproc data */ 1619 /* recv a->j is done */ 1620 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1621 for (i=0; i<nrqs; i++) { 1622 proc = pa[i]; 1623 sbuf1_i = sbuf1[proc]; 1624 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1625 ct1 = 2 + 1; 1626 ct2 = 0; 1627 
rbuf2_i = rbuf2[i]; /* received length of C->j */ 1628 rbuf3_i = rbuf3[i]; /* received C->j */ 1629 1630 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1631 max1 = sbuf1_i[2]; 1632 for (k=0; k<max1; k++,ct1++) { 1633 #if defined(PETSC_USE_CTABLE) 1634 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1635 row--; 1636 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1637 #else 1638 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1639 #endif 1640 /* Now, store row index of submat in sbuf1_i[ct1] */ 1641 sbuf1_i[ct1] = row; 1642 1643 nnz = rbuf2_i[ct1]; 1644 if (!allcolumns) { 1645 for (l=0; l<nnz; l++,ct2++) { 1646 #if defined(PETSC_USE_CTABLE) 1647 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1648 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1649 } else { 1650 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1651 } 1652 #else 1653 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1654 #endif 1655 if (tcol) lens[row]++; 1656 } 1657 } else { /* allcolumns */ 1658 lens[row] += nnz; 1659 } 1660 } 1661 } 1662 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1663 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1664 1665 /* Create the submatrices */ 1666 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1667 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1668 1669 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1670 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1671 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1672 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1673 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1674 1675 /* create struct Mat_SubMat and attached it to submat */ 1676 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1677 subc = (Mat_SeqAIJ*)submat->data; 1678 subc->submatis1 = smatis1; 1679 1680 smatis1->nrqs = nrqs; 
1681 smatis1->nrqr = nrqr; 1682 smatis1->rbuf1 = rbuf1; 1683 smatis1->rbuf2 = rbuf2; 1684 smatis1->rbuf3 = rbuf3; 1685 smatis1->sbuf2 = sbuf2; 1686 smatis1->req_source2 = req_source2; 1687 1688 smatis1->sbuf1 = sbuf1; 1689 smatis1->ptr = ptr; 1690 smatis1->tmp = tmp; 1691 smatis1->ctr = ctr; 1692 1693 smatis1->pa = pa; 1694 smatis1->req_size = req_size; 1695 smatis1->req_source1 = req_source1; 1696 1697 smatis1->allcolumns = allcolumns; 1698 smatis1->row2proc = row2proc; 1699 smatis1->rmap = rmap; 1700 smatis1->cmap = cmap; 1701 #if defined(PETSC_USE_CTABLE) 1702 smatis1->rmap_loc = rmap_loc; 1703 smatis1->cmap_loc = cmap_loc; 1704 #endif 1705 1706 smatis1->destroy = submat->ops->destroy; 1707 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1708 submat->factortype = C->factortype; 1709 1710 /* compute rmax */ 1711 rmax = 0; 1712 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1713 1714 } else { /* scall == MAT_REUSE_MATRIX */ 1715 submat = submats[0]; 1716 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1717 1718 subc = (Mat_SeqAIJ*)submat->data; 1719 rmax = subc->rmax; 1720 smatis1 = subc->submatis1; 1721 nrqs = smatis1->nrqs; 1722 nrqr = smatis1->nrqr; 1723 rbuf1 = smatis1->rbuf1; 1724 rbuf2 = smatis1->rbuf2; 1725 rbuf3 = smatis1->rbuf3; 1726 req_source2 = smatis1->req_source2; 1727 1728 sbuf1 = smatis1->sbuf1; 1729 sbuf2 = smatis1->sbuf2; 1730 ptr = smatis1->ptr; 1731 tmp = smatis1->tmp; 1732 ctr = smatis1->ctr; 1733 1734 pa = smatis1->pa; 1735 req_size = smatis1->req_size; 1736 req_source1 = smatis1->req_source1; 1737 1738 allcolumns = smatis1->allcolumns; 1739 row2proc = smatis1->row2proc; 1740 rmap = smatis1->rmap; 1741 cmap = smatis1->cmap; 1742 #if defined(PETSC_USE_CTABLE) 1743 rmap_loc = smatis1->rmap_loc; 1744 cmap_loc = smatis1->cmap_loc; 1745 #endif 1746 } 1747 1748 /* Post recv matrix values */ 1749 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1750 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1751 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1752 for (i=0; i<nrqs; ++i) { 1753 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1754 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1755 } 1756 1757 /* Allocate sending buffers for a->a, and send them off */ 1758 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1759 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1760 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1761 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1762 1763 for (i=0; i<nrqr; i++) { 1764 rbuf1_i = rbuf1[i]; 1765 sbuf_aa_i = sbuf_aa[i]; 1766 ct1 = 2*rbuf1_i[0]+1; 1767 ct2 = 0; 1768 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1769 1770 kmax = rbuf1_i[2]; 1771 for (k=0; k<kmax; k++,ct1++) { 1772 row = rbuf1_i[ct1] - rstart; 1773 nzA = ai[row+1] - ai[row]; 1774 nzB = bi[row+1] - 
bi[row]; 1775 ncols = nzA + nzB; 1776 cworkB = bj + bi[row]; 1777 vworkA = a_a + ai[row]; 1778 vworkB = b_a + bi[row]; 1779 1780 /* load the column values for this row into vals*/ 1781 vals = sbuf_aa_i + ct2; 1782 1783 lwrite = 0; 1784 for (l=0; l<nzB; l++) { 1785 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1786 } 1787 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1788 for (l=0; l<nzB; l++) { 1789 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1790 } 1791 1792 ct2 += ncols; 1793 } 1794 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1795 } 1796 1797 /* Assemble submat */ 1798 /* First assemble the local rows */ 1799 for (j=0; j<nrow; j++) { 1800 row = irow[j]; 1801 proc = row2proc[j]; 1802 if (proc == rank) { 1803 Crow = row - rstart; /* local row index of C */ 1804 #if defined(PETSC_USE_CTABLE) 1805 row = rmap_loc[Crow]; /* row index of submat */ 1806 #else 1807 row = rmap[row]; 1808 #endif 1809 1810 if (allcolumns) { 1811 /* diagonal part A = c->A */ 1812 ncols = ai[Crow+1] - ai[Crow]; 1813 cols = aj + ai[Crow]; 1814 vals = a->a + ai[Crow]; 1815 i = 0; 1816 for (k=0; k<ncols; k++) { 1817 subcols[i] = cols[k] + cstart; 1818 subvals[i++] = vals[k]; 1819 } 1820 1821 /* off-diagonal part B = c->B */ 1822 ncols = bi[Crow+1] - bi[Crow]; 1823 cols = bj + bi[Crow]; 1824 vals = b->a + bi[Crow]; 1825 for (k=0; k<ncols; k++) { 1826 subcols[i] = bmap[cols[k]]; 1827 subvals[i++] = vals[k]; 1828 } 1829 1830 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1831 1832 } else { /* !allcolumns */ 1833 #if defined(PETSC_USE_CTABLE) 1834 /* diagonal part A = c->A */ 1835 ncols = ai[Crow+1] - ai[Crow]; 1836 cols = aj + ai[Crow]; 1837 vals = a->a + ai[Crow]; 1838 i = 0; 1839 for (k=0; k<ncols; k++) { 1840 tcol = cmap_loc[cols[k]]; 1841 if (tcol) { 1842 subcols[i] = --tcol; 1843 subvals[i++] = vals[k]; 1844 } 1845 } 1846 1847 /* off-diagonal part B = c->B */ 
1848 ncols = bi[Crow+1] - bi[Crow]; 1849 cols = bj + bi[Crow]; 1850 vals = b->a + bi[Crow]; 1851 for (k=0; k<ncols; k++) { 1852 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1853 if (tcol) { 1854 subcols[i] = --tcol; 1855 subvals[i++] = vals[k]; 1856 } 1857 } 1858 #else 1859 /* diagonal part A = c->A */ 1860 ncols = ai[Crow+1] - ai[Crow]; 1861 cols = aj + ai[Crow]; 1862 vals = a->a + ai[Crow]; 1863 i = 0; 1864 for (k=0; k<ncols; k++) { 1865 tcol = cmap[cols[k]+cstart]; 1866 if (tcol) { 1867 subcols[i] = --tcol; 1868 subvals[i++] = vals[k]; 1869 } 1870 } 1871 1872 /* off-diagonal part B = c->B */ 1873 ncols = bi[Crow+1] - bi[Crow]; 1874 cols = bj + bi[Crow]; 1875 vals = b->a + bi[Crow]; 1876 for (k=0; k<ncols; k++) { 1877 tcol = cmap[bmap[cols[k]]]; 1878 if (tcol) { 1879 subcols[i] = --tcol; 1880 subvals[i++] = vals[k]; 1881 } 1882 } 1883 #endif 1884 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1885 } 1886 } 1887 } 1888 1889 /* Now assemble the off-proc rows */ 1890 for (i=0; i<nrqs; i++) { /* for each requested message */ 1891 /* recv values from other processes */ 1892 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1893 proc = pa[idex]; 1894 sbuf1_i = sbuf1[proc]; 1895 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1896 ct1 = 2 + 1; 1897 ct2 = 0; /* count of received C->j */ 1898 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1899 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1900 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1901 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1902 1903 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1904 max1 = sbuf1_i[2]; /* num of rows */ 1905 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1906 row = sbuf1_i[ct1]; /* row index of submat */ 
1907 if (!allcolumns) { 1908 idex = 0; 1909 if (scall == MAT_INITIAL_MATRIX) { 1910 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1911 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1912 #if defined(PETSC_USE_CTABLE) 1913 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1914 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1915 } else { 1916 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1917 } 1918 #else 1919 tcol = cmap[rbuf3_i[ct2]]; 1920 #endif 1921 if (tcol) { 1922 subcols[idex] = --tcol; 1923 subvals[idex++] = rbuf4_i[ct2]; 1924 1925 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1926 For reuse, we replace received C->j with index that should be inserted to submat */ 1927 rbuf3_i[ct3++] = ct2; 1928 } 1929 } 1930 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1931 1932 } else { /* scall == MAT_REUSE_MATRIX */ 1933 submat = submats[0]; 1934 subc = (Mat_SeqAIJ*)submat->data; 1935 1936 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1937 for (l=0; l<nnz; l++) { 1938 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1939 subvals[idex++] = rbuf4_i[ct2]; 1940 } 1941 1942 bj = subc->j + subc->i[row]; 1943 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1944 } 1945 } else { /* allcolumns */ 1946 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1947 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1948 ct2 += nnz; 1949 } 1950 } 1951 } 1952 1953 /* sending a->a are done */ 1954 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1955 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1956 1957 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1958 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1959 submats[0] = submat; 1960 1961 /* 
Restore the indices */ 1962 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1963 if (!allcolumns) { 1964 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1965 } 1966 1967 /* Destroy allocated memory */ 1968 for (i=0; i<nrqs; ++i) { 1969 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1970 } 1971 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1972 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1973 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1974 1975 if (scall == MAT_INITIAL_MATRIX) { 1976 ierr = PetscFree(lens);CHKERRQ(ierr); 1977 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1978 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1979 } 1980 PetscFunctionReturn(0); 1981 } 1982 1983 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1984 { 1985 PetscErrorCode ierr; 1986 PetscInt ncol; 1987 PetscBool colflag,allcolumns=PETSC_FALSE; 1988 1989 PetscFunctionBegin; 1990 /* Allocate memory to hold all the submatrices */ 1991 if (scall != MAT_REUSE_MATRIX) { 1992 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 1993 } 1994 1995 /* Check for special case: each processor gets entire matrix columns */ 1996 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1997 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1998 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1999 2000 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2001 PetscFunctionReturn(0); 2002 } 2003 2004 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2005 { 2006 PetscErrorCode ierr; 2007 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2008 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2009 2010 PetscFunctionBegin; 2011 /* Check for special case: each processor gets entire matrix */ 2012 if (C->submat_singleis) { /* flag is 
set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2013 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2014 PetscFunctionReturn(0); 2015 } 2016 2017 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2018 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2019 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2020 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2021 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2022 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2023 wantallmatrix = PETSC_TRUE; 2024 2025 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2026 } 2027 } 2028 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2029 if (twantallmatrix) { 2030 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2031 PetscFunctionReturn(0); 2032 } 2033 2034 /* Allocate memory to hold all the submatrices */ 2035 if (scall != MAT_REUSE_MATRIX) { 2036 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2037 } 2038 2039 /* Check for special case: each processor gets entire matrix columns */ 2040 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2041 for (i=0; i<ismax; i++) { 2042 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2043 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2044 if (colflag && ncol == C->cmap->N) { 2045 allcolumns[i] = PETSC_TRUE; 2046 } else { 2047 allcolumns[i] = PETSC_FALSE; 2048 } 2049 } 2050 2051 /* Determine the number of stages through which submatrices are done */ 2052 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2053 2054 /* 2055 Each stage will extract nmax submatrices. 2056 nmax is determined by the matrix column dimension. 2057 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 
2058 */ 2059 if (!nmax) nmax = 1; 2060 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 2061 2062 /* Make sure every processor loops through the nstages */ 2063 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2064 2065 for (i=0,pos=0; i<nstages; i++) { 2066 if (pos+nmax <= ismax) max_no = nmax; 2067 else if (pos == ismax) max_no = 0; 2068 else max_no = ismax-pos; 2069 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2070 pos += max_no; 2071 } 2072 2073 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2074 PetscFunctionReturn(0); 2075 } 2076 2077 /* -------------------------------------------------------------------------*/ 2078 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2079 { 2080 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2081 Mat A = c->A; 2082 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2083 const PetscInt **icol,**irow; 2084 PetscInt *nrow,*ncol,start; 2085 PetscErrorCode ierr; 2086 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2087 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2088 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2089 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2090 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2091 #if defined(PETSC_USE_CTABLE) 2092 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2093 #else 2094 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2095 #endif 2096 const PetscInt *irow_i; 2097 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2098 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2099 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2100 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2101 MPI_Status *r_status3,*r_status4,*s_status4; 2102 MPI_Comm comm; 2103 
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  /* Cache the indices and sizes of every IS; allcolumns[i] uses no column IS */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  /* w1[p] = #rows requested from proc p, w3[p] = #ISs that touch proc p,
     w4 is per-IS scratch */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      /* linear scan of the ownership ranges to find the owner of this row */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp holds 2*msz ints: the first msz
     back the send buffers sbuf1, the second msz back rbuf2 (see below) */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp;
    k = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += k;          /* advance by the previous message's length */
      sbuf1[j] = iptr;
      k        = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;  /* data region starts after the header */
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;   /* row count for IS i going to proc j */
        sbuf1_j[2*k-1] = i;       /* which IS these rows belong to */
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  /* rbuf2 reuses the second half of tmp rather than a fresh allocation */
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
    PetscInt *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1;   /* skip the request header */
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      /* reply with the number of nonzeros (diag + offdiag) of each requested row */
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    /* note: waits/buffers for the column-index and value streams are indexed
       by idex (the original request slot), not by completion order i */
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  /* one contiguous allocation partitioned by req_size[] */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];   /* row count for the j-th IS in this request */
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols = sbuf_aj_i + ct2;

          /* emit columns in global order: off-diag left of the diagonal
             block, then the diagonal block, then off-diag right of it */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          vals = sbuf_aa_i+ct2;

          /* values follow exactly the same ordering as the column indices above */
          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  /* stored 1-based so that 0 can mean "column not in this submatrix" */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1;
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  /* lens is one contiguous array of total-row length, partitioned per IS */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
        Assumes new rows are same length as the old rows,hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;   /* cmap entries are 1-based */
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k]; /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        rmap_i = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  /* sort the rows */
  {
    PetscInt    *imat_ilen,*imat_j,*imat_i;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      imat_ilen = mat->ilen;

      if (allcolumns[i]) continue;   /* allcolumns rows were written in column order already */
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscInt ilen;

        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ilen  = imat_ilen[j];
        ierr  = PetscSortIntWithScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);   /* also releases the storage rbuf2[] aliases */
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
    Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.
2835 2836 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2837 Following this call, C->A & C->B have been created, even if empty. 2838 */ 2839 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2840 { 2841 /* If making this function public, change the error returned in this function away from _PLIB. */ 2842 PetscErrorCode ierr; 2843 Mat_MPIAIJ *aij; 2844 Mat_SeqAIJ *Baij; 2845 PetscBool seqaij,Bdisassembled; 2846 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2847 PetscScalar v; 2848 const PetscInt *rowindices,*colindices; 2849 2850 PetscFunctionBegin; 2851 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2852 if (A) { 2853 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2854 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2855 if (rowemb) { 2856 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2857 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2858 } else { 2859 if (C->rmap->n != A->rmap->n) { 2860 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2861 } 2862 } 2863 if (dcolemb) { 2864 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2865 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2866 } else { 2867 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2868 } 2869 } 2870 if (B) { 2871 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2872 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2873 if (rowemb) { 2874 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2875 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2876 } else { 2877 if (C->rmap->n != B->rmap->n) { 2878 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2879 } 2880 } 2881 if (ocolemb) { 2882 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2883 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2884 } else { 2885 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2886 } 2887 } 2888 2889 aij = (Mat_MPIAIJ*)(C->data); 2890 if (!aij->A) { 2891 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2892 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2893 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2894 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2895 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2896 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2897 } 2898 if (A) { 2899 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2900 } else { 2901 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2902 } 2903 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2904 /* 2905 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2906 need to "disassemble" B -- convert it to using C's global indices. 2907 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2908 2909 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2910 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2911 2912 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2913 At least avoid calling MatSetValues() and the implied searches? 2914 */ 2915 2916 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2917 #if defined(PETSC_USE_CTABLE) 2918 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2919 #else 2920 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2921 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2922 if (aij->B) { 2923 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2924 } 2925 #endif 2926 ngcol = 0; 2927 if (aij->lvec) { 2928 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2929 } 2930 if (aij->garray) { 2931 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2932 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2933 } 2934 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2935 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2936 } 2937 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2938 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2939 } 2940 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2941 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2942 } 2943 } 2944 Bdisassembled = PETSC_FALSE; 2945 if (!aij->B) { 2946 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2947 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2948 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2949 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2950 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2951 Bdisassembled = PETSC_TRUE; 2952 } 2953 if (B) { 2954 Baij = (Mat_SeqAIJ*)(B->data); 2955 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2956 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2957 for (i=0; i<B->rmap->n; i++) { 2958 nz[i] = Baij->i[i+1] - Baij->i[i]; 2959 } 2960 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
2961 ierr = PetscFree(nz);CHKERRQ(ierr); 2962 } 2963 2964 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2965 shift = rend-rstart; 2966 count = 0; 2967 rowindices = NULL; 2968 colindices = NULL; 2969 if (rowemb) { 2970 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 2971 } 2972 if (ocolemb) { 2973 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 2974 } 2975 for (i=0; i<B->rmap->n; i++) { 2976 PetscInt row; 2977 row = i; 2978 if (rowindices) row = rowindices[i]; 2979 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 2980 col = Baij->j[count]; 2981 if (colindices) col = colindices[col]; 2982 if (Bdisassembled && col>=rstart) col += shift; 2983 v = Baij->a[count]; 2984 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 2985 ++count; 2986 } 2987 } 2988 /* No assembly for aij->B is necessary. */ 2989 /* FIXME: set aij->B's nonzerostate correctly. */ 2990 } else { 2991 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 2992 } 2993 C->preallocated = PETSC_TRUE; 2994 C->was_assembled = PETSC_FALSE; 2995 C->assembled = PETSC_FALSE; 2996 /* 2997 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 2998 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 2999 */ 3000 PetscFunctionReturn(0); 3001 } 3002 3003 /* 3004 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3005 */ 3006 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3007 { 3008 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3009 3010 PetscFunctionBegin; 3011 PetscValidPointer(A,2); 3012 PetscValidPointer(B,3); 3013 /* FIXME: make sure C is assembled */ 3014 *A = aij->A; 3015 *B = aij->B; 3016 /* Note that we don't incref *A and *B, so be careful! */ 3017 PetscFunctionReturn(0); 3018 } 3019 3020 /* 3021 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
 */
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                            PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                            PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                            PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                            PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar,ciscol_localsize;
  Mat            *A,*B;     /* diag (A) and off-diag (B) sequential pieces; ownership depends on scall */
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the parallel (comm size > 1) index sets; also validate that each row/col IS pair
     shares a communicator. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    /* NOTE(review): this declaration shadows the function-scope isize on purpose or by
       accident -- behavior is the same either way, but worth confirming upstream. */
    PetscMPIInt isize;
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);


    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /* Alias (no incref) the row IS as the off-diag row IS. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  /* I did not figure out a good way to fix it right now.
   * Local column size of B[i] is different from the size of ciscol[i].
   * B[i]'s size is finally determined by MatAssembly, while
   * ciscol[i] is computed as the complement of iscol[i].
   * It is better to keep only nonzero indices when computing
   * the complement ciscol[i].
   * */
  /* HACK(review): forcing B[i]'s local column size to match ciscol[i] so that the reuse path of
     getsubmats_seq() accepts it; see the note above -- confirm this stays valid if ciscol ever
     keeps only nonzero columns. */
  if(scall==MAT_REUSE_MATRIX){
    for(i=0; i<cismax; i++){
      ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr);
      B[i]->cmap->n = ciscol_localsize;
    }
  }
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
        - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
        - if caching arrays to hold the ISs, make and compose a container for them so that it can
          be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        BB = NULL;
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i];
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}



/* Thin MPIAIJ front-end: dispatch MatGetSubMatricesMPI_MPIXAIJ() with the AIJ-specific callbacks. */
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}