1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++){ 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++){ 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank){ 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows){ 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++){ 115 if (tosizes_temp[i]){ 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 for (i=0; i<nto; i++){ 123 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 124 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 125 } 126 /* send information to other processors */ 127 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 128 nrecvrows = 0; 129 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 130 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 131 nrecvrows = 0; 132 for (i=0; i<nfrom; i++){ 133 for (j=0; j<fromsizes[2*i]; j++){ 134 remote[nrecvrows].rank = fromranks[i]; 135 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 136 } 137 } 138 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 139 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 140 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 141 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 142 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 143 /* message pair <no of is, row> */ 144 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 145 for (i=0; i<nrrows; i++){ 146 owner = rrow_ranks[i]; /* processor */ 147 j = tosizes_temp[owner]; /* index */ 148 todata[toffsets[j]++] = rrow_isids[i]; 149 todata[toffsets[j]++] = remoterows[i]; 150 } 151 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 152 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 153 ierr = PetscFree(toffsets);CHKERRQ(ierr); 154 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 155 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 157 /* send rows belonging to the remote so that then we could get the overlapping data back */ 158 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 159 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 160 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 161 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 162 ierr = PetscFree(fromranks);CHKERRQ(ierr); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 165 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 166 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 167 nrecvrows = 0; 168 for (i=0; i<nto; i++){ 169 for (j=0; j<tosizes[2*i]; j++){ 170 remote[nrecvrows].rank = toranks[i]; 171 remote[nrecvrows++].index = tosizes[2*i+1]+j; 172 } 173 } 174 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 175 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 176 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 177 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 178 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 179 /* overlap communication and computation */ 180 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 181 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 182 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 183 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 184 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 185 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 186 ierr = PetscFree(toranks);CHKERRQ(ierr); 187 ierr = PetscFree(tosizes);CHKERRQ(ierr); 188 ierr = PetscFree(todata);CHKERRQ(ierr); 189 PetscFunctionReturn(0); 190 } 191 192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 193 { 194 PetscInt *isz,isz_i,i,j,is_id, data_size; 195 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 196 const PetscInt *indices_i_temp; 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 max_lsize = 0; 201 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 202 for (i=0; i<nidx; i++){ 203 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 204 max_lsize = lsize>max_lsize ? lsize:max_lsize; 205 isz[i] = lsize; 206 } 207 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 208 for (i=0; i<nidx; i++){ 209 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 210 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 212 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 213 } 214 /* retrieve information to get row id and its overlap */ 215 for (i=0; i<nrecvs; ){ 216 is_id = recvdata[i++]; 217 data_size = recvdata[i++]; 218 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 219 isz_i = isz[is_id]; 220 for (j=0; j< data_size; j++){ 221 col = recvdata[i++]; 222 indices_i[isz_i++] = col; 223 } 224 isz[is_id] = isz_i; 225 } 226 /* remove duplicate entities */ 227 for (i=0; i<nidx; i++){ 228 indices_i = indices_temp+(max_lsize+nrecvs)*i; 229 isz_i = isz[i]; 230 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 231 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 232 } 233 ierr = PetscFree(isz);CHKERRQ(ierr); 234 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 239 { 240 PetscLayout rmap,cmap; 241 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 242 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 243 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 244 const PetscInt *gcols,*ai,*aj,*bi,*bj; 245 Mat amat,bmat; 246 PetscMPIInt rank; 247 PetscBool done; 248 MPI_Comm comm; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 253 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 254 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 255 /* Even if the mat is symmetric, we still assume it is not symmetric */ 256 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 257 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 258 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 259 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 260 /* total number of nonzero values is used to estimate the memory usage in the next step */ 261 tnz = ai[an]+bi[bn]; 262 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 263 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 264 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 265 /* to find the longest message */ 266 max_fszs = 0; 267 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 268 /* better way to estimate number of nonzero in the mat??? */ 269 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 270 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 271 rows_pos = 0; 272 totalrows = 0; 273 for (i=0; i<nfrom; i++){ 274 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 275 /* group data together */ 276 for (j=0; j<fromsizes[2*i]; j+=2){ 277 is_id = fromrows[rows_pos++];/* no of is */ 278 rows_i = rows_data[is_id]; 279 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 280 } 281 /* estimate a space to avoid multiple allocations */ 282 for (j=0; j<nidx; j++){ 283 indvc_ij = 0; 284 rows_i = rows_data[j]; 285 for (l=0; l<rows_pos_i[j]; l++){ 286 row = rows_i[l]-rstart; 287 start = ai[row]; 288 end = ai[row+1]; 289 for (k=start; k<end; k++){ /* Amat */ 290 col = aj[k] + cstart; 291 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 292 } 293 start = bi[row]; 294 end = bi[row+1]; 295 for (k=start; k<end; k++) { /* Bmat */ 296 col = gcols[bj[k]]; 297 indices_tmp[indvc_ij++] = col; 298 } 299 } 300 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 301 indv_counts[i*nidx+j] = indvc_ij; 302 totalrows += indvc_ij; 303 } 304 } 305 /* message triple <no of is, number of rows, rows> */ 306 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 307 totalrows = 0; 308 rows_pos = 0; 309 /* use this code again */ 310 for (i=0;i<nfrom;i++){ 311 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 312 for (j=0; j<fromsizes[2*i]; j+=2){ 313 is_id = fromrows[rows_pos++]; 314 rows_i = rows_data[is_id]; 315 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 316 } 317 /* add data */ 318 for (j=0; j<nidx; j++){ 319 if (!indv_counts[i*nidx+j]) continue; 320 indvc_ij = 0; 321 sbdata[totalrows++] = j; 322 sbdata[totalrows++] = indv_counts[i*nidx+j]; 323 sbsizes[2*i] += 2; 324 rows_i = rows_data[j]; 325 for (l=0; l<rows_pos_i[j]; l++){ 326 row = rows_i[l]-rstart; 327 start = ai[row]; 328 end = ai[row+1]; 329 for (k=start; k<end; k++){ /* Amat */ 330 col = aj[k] + cstart; 331 indices_tmp[indvc_ij++] = col; 332 } 333 start = bi[row]; 334 end = bi[row+1]; 335 for (k=start; k<end; k++) { /* Bmat */ 336 col = gcols[bj[k]]; 337 indices_tmp[indvc_ij++] = col; 338 } 339 } 340 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 341 sbsizes[2*i] += indvc_ij; 342 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 343 totalrows += indvc_ij; 344 } 345 } 346 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 347 for (i=0; i<nfrom; i++){ 348 offsets[i+1] = offsets[i] + sbsizes[2*i]; 349 sbsizes[2*i+1] = offsets[i]; 350 } 351 ierr = PetscFree(offsets);CHKERRQ(ierr); 352 if (sbrowsizes) *sbrowsizes = sbsizes; 353 if (sbrows) *sbrows = sbdata; 354 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 355 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 356 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 357 PetscFunctionReturn(0); 358 } 359 360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 361 { 362 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 363 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 364 PetscInt lsize,lsize_tmp,owner; 365 PetscMPIInt rank; 366 Mat amat,bmat; 367 PetscBool done; 368 PetscLayout cmap,rmap; 369 MPI_Comm comm; 370 PetscErrorCode ierr; 371 372 PetscFunctionBegin; 373 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 374 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 375 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 376 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 377 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 378 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 379 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 380 /* is it a safe way to compute number of nonzero values ? */ 381 tnz = ai[an]+bi[bn]; 382 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 383 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 384 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 385 /* it is a better way to estimate memory than the old implementation 386 * where global size of matrix is used 387 * */ 388 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 389 for (i=0; i<nidx; i++) { 390 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 391 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 392 lsize_tmp = 0; 393 for (j=0; j<lsize; j++) { 394 owner = -1; 395 row = indices[j]; 396 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 397 if (owner != rank) continue; 398 /* local number */ 399 row -= rstart; 400 start = ai[row]; 401 end = ai[row+1]; 402 for (k=start; k<end; k++) { /* Amat */ 403 col = aj[k] + cstart; 404 indices_temp[lsize_tmp++] = col; 405 } 406 start = bi[row]; 407 end = bi[row+1]; 408 for (k=start; k<end; k++) { /* Bmat */ 409 col = gcols[bj[k]]; 410 indices_temp[lsize_tmp++] = col; 411 } 412 } 413 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 414 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 415 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 416 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 417 } 418 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 419 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 420 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 421 PetscFunctionReturn(0); 422 } 423 424 425 /* 426 Sample message format: 427 If a processor A wants processor B to process some elements corresponding 428 to index sets is[1],is[5] 429 mesg [0] = 2 (no of index sets in the mesg) 430 ----------- 431 mesg [1] = 1 => is[1] 432 mesg [2] = sizeof(is[1]); 433 ----------- 434 mesg [3] = 5 => is[5] 435 mesg [4] = sizeof(is[5]); 436 ----------- 437 mesg [5] 438 mesg [n] datas[1] 439 ----------- 440 mesg[n+1] 441 mesg[m] data(is[5]) 442 ----------- 443 444 Notes: 445 nrqs - no of requests sent (or to be sent out) 446 nrqr - no of requests recieved (which have to be or which have been processed 447 */ 448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 449 { 450 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 451 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 452 const PetscInt **idx,*idx_i; 453 PetscInt *n,**data,len; 454 #if defined(PETSC_USE_CTABLE) 455 PetscTable *table_data,table_data_i; 456 PetscInt *tdata,tcount,tcount_max; 457 #else 458 PetscInt *data_i,*d_p; 459 #endif 460 PetscErrorCode ierr; 461 PetscMPIInt size,rank,tag1,tag2; 462 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 463 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 464 PetscBT *table; 465 MPI_Comm comm; 466 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 467 MPI_Status *s_status,*recv_status; 468 char *t_p; 469 470 PetscFunctionBegin; 471 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 472 size = c->size; 473 rank = c->rank; 474 M = C->rmap->N; 475 476 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 477 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 478 479 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 480 481 for (i=0; i<imax; i++) { 482 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 483 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 484 } 485 486 /* evaluate communication - mesg to who,length of mesg, and buffer space 487 required. Based on this, buffers are allocated, and data copied into them */ 488 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 489 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 490 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 491 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 492 for (i=0; i<imax; i++) { 493 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 494 idx_i = idx[i]; 495 len = n[i]; 496 for (j=0; j<len; j++) { 497 row = idx_i[j]; 498 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 499 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 500 w4[proc]++; 501 } 502 for (j=0; j<size; j++) { 503 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 504 } 505 } 506 507 nrqs = 0; /* no of outgoing messages */ 508 msz = 0; /* total mesg length (for all proc */ 509 w1[rank] = 0; /* no mesg sent to intself */ 510 w3[rank] = 0; 511 for (i=0; i<size; i++) { 512 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 513 } 514 /* pa - is list of processors to communicate with */ 515 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 516 for (i=0,j=0; i<size; i++) { 517 if (w1[i]) {pa[j] = i; j++;} 518 } 519 520 /* Each message would have a header = 1 + 2*(no of IS) + data */ 521 for (i=0; i<nrqs; i++) { 522 j = pa[i]; 523 w1[j] += w2[j] + 2*w3[j]; 524 msz += w1[j]; 525 } 526 527 /* Determine the number of messages to expect, their lengths, from from-ids */ 528 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 529 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 530 531 /* Now post the Irecvs corresponding to these messages */ 532 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 533 534 /* Allocate Memory for outgoing messages */ 535 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 536 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 537 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 538 539 { 540 PetscInt *iptr = tmp,ict = 0; 541 for (i=0; i<nrqs; i++) { 542 j = pa[i]; 543 iptr += ict; 544 outdat[j] = iptr; 545 ict = w1[j]; 546 } 547 } 548 549 /* Form the outgoing messages */ 550 /* plug in the headers */ 551 for (i=0; i<nrqs; i++) { 552 j = pa[i]; 553 outdat[j][0] = 0; 554 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 555 ptr[j] = outdat[j] + 2*w3[j] + 1; 556 } 557 558 /* Memory for doing local proc's work */ 559 { 560 PetscInt M_BPB_imax = 0; 561 #if defined(PETSC_USE_CTABLE) 562 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 563 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 564 for (i=0; i<imax; i++) { 565 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 566 } 567 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 568 for (i=0; i<imax; i++) { 569 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 570 } 571 #else 572 PetscInt Mimax = 0; 573 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 574 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 575 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 576 for (i=0; i<imax; i++) { 577 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 578 data[i] = d_p + M*i; 579 } 580 #endif 581 } 582 583 /* Parse the IS and update local tables and the outgoing buf with the data */ 584 { 585 PetscInt n_i,isz_i,*outdat_j,ctr_j; 586 PetscBT table_i; 587 588 for (i=0; i<imax; i++) { 589 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 590 n_i = n[i]; 591 table_i = table[i]; 592 idx_i = idx[i]; 593 #if defined(PETSC_USE_CTABLE) 594 table_data_i = table_data[i]; 595 #else 596 data_i = data[i]; 597 #endif 598 isz_i = isz[i]; 599 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 600 row = idx_i[j]; 601 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 602 if (proc != rank) { /* copy to the outgoing buffer */ 603 ctr[proc]++; 604 *ptr[proc] = row; 605 ptr[proc]++; 606 } else if (!PetscBTLookupSet(table_i,row)) { 607 #if defined(PETSC_USE_CTABLE) 608 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 609 #else 610 data_i[isz_i] = row; /* Update the local table */ 611 #endif 612 isz_i++; 613 } 614 } 615 /* Update the headers for the current IS */ 616 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 617 if ((ctr_j = ctr[j])) { 618 outdat_j = outdat[j]; 619 k = ++outdat_j[0]; 620 outdat_j[2*k] = ctr_j; 621 outdat_j[2*k-1] = i; 622 } 623 } 624 isz[i] = isz_i; 625 } 626 } 627 628 /* Now post the sends */ 629 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 630 for (i=0; i<nrqs; ++i) { 631 j = pa[i]; 632 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 633 } 634 635 /* No longer need the original indices */ 636 for (i=0; i<imax; ++i) { 637 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 638 } 639 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 640 641 for (i=0; i<imax; ++i) { 642 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 643 } 644 645 /* Do Local work */ 646 #if defined(PETSC_USE_CTABLE) 647 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 648 #else 649 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 650 #endif 651 652 /* Receive messages */ 653 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 654 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 655 656 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 657 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 658 659 /* Phase 1 sends are complete - deallocate buffers */ 660 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 661 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 662 663 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 664 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 665 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 666 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 667 ierr = PetscFree(rbuf);CHKERRQ(ierr); 668 669 670 /* Send the data back */ 671 /* Do a global reduction to know the buffer space req for incoming messages */ 672 { 673 PetscMPIInt *rw1; 674 675 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 676 677 for (i=0; i<nrqr; ++i) { 678 proc = recv_status[i].MPI_SOURCE; 679 680 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 681 rw1[proc] = isz1[i]; 682 } 683 ierr = PetscFree(onodes1);CHKERRQ(ierr); 684 ierr = PetscFree(olengths1);CHKERRQ(ierr); 685 686 /* Determine the number of messages to expect, their lengths, from from-ids */ 687 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 688 ierr = PetscFree(rw1);CHKERRQ(ierr); 689 } 690 /* Now post the Irecvs corresponding to these messages */ 691 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 692 693 /* Now post the sends */ 694 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 695 for (i=0; i<nrqr; ++i) { 696 j = recv_status[i].MPI_SOURCE; 697 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 698 } 699 700 /* receive work done on other processors */ 701 { 702 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 703 PetscMPIInt idex; 704 PetscBT table_i; 705 MPI_Status *status2; 706 707 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 708 for (i=0; i<nrqs; ++i) { 709 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 710 /* Process the message */ 711 rbuf2_i = rbuf2[idex]; 712 ct1 = 2*rbuf2_i[0]+1; 713 jmax = rbuf2[idex][0]; 714 for (j=1; j<=jmax; j++) { 715 max = rbuf2_i[2*j]; 716 is_no = rbuf2_i[2*j-1]; 717 isz_i = isz[is_no]; 718 table_i = table[is_no]; 719 #if defined(PETSC_USE_CTABLE) 720 table_data_i = table_data[is_no]; 721 #else 722 data_i = data[is_no]; 723 #endif 724 for (k=0; k<max; k++,ct1++) { 725 row = rbuf2_i[ct1]; 726 if (!PetscBTLookupSet(table_i,row)) { 727 #if defined(PETSC_USE_CTABLE) 728 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 729 #else 730 data_i[isz_i] = row; 731 #endif 732 isz_i++; 733 } 734 } 735 isz[is_no] = isz_i; 736 } 737 } 738 739 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 740 ierr = PetscFree(status2);CHKERRQ(ierr); 741 } 742 743 #if defined(PETSC_USE_CTABLE) 744 tcount_max = 0; 745 for (i=0; i<imax; ++i) { 746 table_data_i = table_data[i]; 747 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 748 if (tcount_max < tcount) tcount_max = tcount; 749 } 750 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 751 #endif 752 753 for (i=0; i<imax; ++i) { 754 #if defined(PETSC_USE_CTABLE) 755 PetscTablePosition tpos; 756 table_data_i = table_data[i]; 757 758 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 759 while (tpos) { 760 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 761 tdata[--j] = --k; 762 } 763 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 764 #else 765 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 766 #endif 767 } 768 769 ierr = PetscFree(onodes2);CHKERRQ(ierr); 770 ierr = PetscFree(olengths2);CHKERRQ(ierr); 771 772 ierr = PetscFree(pa);CHKERRQ(ierr); 773 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 774 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 775 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 776 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 777 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 778 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 779 ierr = PetscFree(s_status);CHKERRQ(ierr); 780 ierr = PetscFree(recv_status);CHKERRQ(ierr); 781 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 782 ierr = PetscFree(xdata);CHKERRQ(ierr); 783 ierr = PetscFree(isz1);CHKERRQ(ierr); 784 #if defined(PETSC_USE_CTABLE) 785 for (i=0; i<imax; i++) { 786 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 787 } 788 ierr = PetscFree(table_data);CHKERRQ(ierr); 789 ierr = PetscFree(tdata);CHKERRQ(ierr); 790 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 791 #else 792 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 793 #endif 794 PetscFunctionReturn(0); 795 } 796 797 /* 798 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 799 the work on the local processor. 800 801 Inputs: 802 C - MAT_MPIAIJ; 803 imax - total no of index sets processed at a time; 804 table - an array of char - size = m bits. 805 806 Output: 807 isz - array containing the count of the solution elements corresponding 808 to each index set; 809 data or table_data - pointer to the solutions 810 */ 811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 812 { 813 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 814 Mat A = c->A,B = c->B; 815 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 816 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 817 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 818 PetscBT table_i; 819 #if defined(PETSC_USE_CTABLE) 820 PetscTable table_data_i; 821 PetscErrorCode ierr; 822 PetscTablePosition tpos; 823 PetscInt tcount,*tdata; 824 #else 825 PetscInt *data_i; 826 #endif 827 828 PetscFunctionBegin; 829 rstart = C->rmap->rstart; 830 cstart = C->cmap->rstart; 831 ai = a->i; 832 aj = a->j; 833 bi = b->i; 834 bj = b->j; 835 garray = c->garray; 836 837 for (i=0; i<imax; i++) { 838 #if defined(PETSC_USE_CTABLE) 839 /* copy existing entries of table_data_i into tdata[] */ 840 table_data_i = table_data[i]; 841 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 842 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 843 844 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 845 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 846 while (tpos) { 847 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 848 tdata[--j] = --row; 849 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 850 } 851 #else 852 data_i = data[i]; 853 #endif 854 table_i = table[i]; 855 isz_i = isz[i]; 856 max = isz[i]; 857 858 for (j=0; j<max; j++) { 859 #if defined(PETSC_USE_CTABLE) 860 row = tdata[j] - rstart; 861 #else 862 row = data_i[j] - rstart; 863 #endif 864 start = ai[row]; 865 end = ai[row+1]; 866 for (k=start; k<end; k++) { /* Amat */ 867 val = aj[k] + cstart; 868 if (!PetscBTLookupSet(table_i,val)) { 869 #if defined(PETSC_USE_CTABLE) 870 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 871 #else 872 data_i[isz_i] = val; 873 #endif 874 isz_i++; 875 } 876 } 877 start = bi[row]; 878 end = bi[row+1]; 879 for (k=start; k<end; k++) { /* Bmat */ 880 val = garray[bj[k]]; 881 if (!PetscBTLookupSet(table_i,val)) { 882 #if defined(PETSC_USE_CTABLE) 883 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 884 #else 885 data_i[isz_i] = val; 886 #endif 887 isz_i++; 888 } 889 } 890 } 891 isz[i] = isz_i; 892 893 #if defined(PETSC_USE_CTABLE) 894 ierr = PetscFree(tdata);CHKERRQ(ierr); 895 #endif 896 } 897 PetscFunctionReturn(0); 898 } 899 900 /* 901 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 902 and return the output 903 904 Input: 905 C - the matrix 906 nrqr - no of messages being processed. 907 rbuf - an array of pointers to the recieved requests 908 909 Output: 910 xdata - array of messages to be sent back 911 isz1 - size of each message 912 913 For better efficiency perhaps we should malloc separately each xdata[i], 914 then if a remalloc is required we need only copy the data for that one row 915 rather then all previous rows as it is now where a single large chunck of 916 memory is used. 917 918 */ 919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 920 { 921 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 922 Mat A = c->A,B = c->B; 923 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 924 PetscErrorCode ierr; 925 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 926 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 927 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 928 PetscInt *rbuf_i,kmax,rbuf_0; 929 PetscBT xtable; 930 931 PetscFunctionBegin; 932 m = C->rmap->N; 933 rstart = C->rmap->rstart; 934 cstart = C->cmap->rstart; 935 ai = a->i; 936 aj = a->j; 937 bi = b->i; 938 bj = b->j; 939 garray = c->garray; 940 941 942 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 943 rbuf_i = rbuf[i]; 944 rbuf_0 = rbuf_i[0]; 945 ct += rbuf_0; 946 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 947 } 948 949 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 950 else max1 = 1; 951 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 952 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 953 ++no_malloc; 954 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 955 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 956 957 ct3 = 0; 958 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 959 rbuf_i = rbuf[i]; 960 rbuf_0 = rbuf_i[0]; 961 ct1 = 2*rbuf_0+1; 962 ct2 = ct1; 963 ct3 += ct1; 964 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 965 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 966 oct2 = ct2; 967 kmax = rbuf_i[2*j]; 968 for (k=0; k<kmax; k++,ct1++) { 969 row = rbuf_i[ct1]; 970 if (!PetscBTLookupSet(xtable,row)) { 971 if (!(ct3 < mem_estimate)) { 972 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 973 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 974 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 975 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 976 xdata[0] = tmp; 977 mem_estimate = new_estimate; ++no_malloc; 978 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 979 } 980 xdata[i][ct2++] = row; 981 ct3++; 982 } 983 } 984 for (k=oct2,max2=ct2; k<max2; k++) { 985 row = xdata[i][k] - rstart; 986 start = ai[row]; 987 end = ai[row+1]; 988 for (l=start; l<end; l++) { 989 val = aj[l] + cstart; 990 if (!PetscBTLookupSet(xtable,val)) { 991 if (!(ct3 < mem_estimate)) { 992 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 993 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 994 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 995 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 996 xdata[0] = tmp; 997 mem_estimate = new_estimate; ++no_malloc; 998 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 999 } 1000 xdata[i][ct2++] = val; 1001 ct3++; 1002 } 1003 } 1004 start = bi[row]; 1005 end = bi[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = garray[bj[l]]; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 } 1023 /* Update the header*/ 1024 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1025 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1026 } 1027 xdata[i][0] = rbuf_0; 1028 xdata[i+1] = xdata[i] + ct2; 1029 isz1[i] = ct2; /* size of each message */ 1030 } 1031 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1032 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1033 PetscFunctionReturn(0); 1034 } 1035 /* -------------------------------------------------------------------------*/ 1036 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 1037 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1038 /* 1039 Every processor gets the entire matrix 1040 */ 1041 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1042 { 1043 Mat B; 1044 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1045 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1046 PetscErrorCode ierr; 1047 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1048 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1049 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1050 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1051 1052 PetscFunctionBegin; 1053 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1054 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1055 1056 if (scall == MAT_INITIAL_MATRIX) { 1057 /* ---------------------------------------------------------------- 1058 Tell every processor the number of nonzeros per row 1059 */ 1060 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1061 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1062 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1063 } 1064 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1065 for (i=0; i<size; i++) { 1066 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1067 displs[i] = A->rmap->range[i]; 1068 } 1069 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1070 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1071 #else 1072 sendcount = A->rmap->rend - A->rmap->rstart; 1073 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1074 #endif 1075 /* --------------------------------------------------------------- 1076 Create the sequential matrix of the same type as the local block diagonal 1077 */ 1078 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1079 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1080 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1081 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1082 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1083 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1084 **Bin = B; 1085 b = (Mat_SeqAIJ*)B->data; 1086 1087 /*-------------------------------------------------------------------- 1088 Copy my part of matrix column indices over 1089 */ 1090 sendcount = ad->nz + bd->nz; 1091 jsendbuf = b->j + b->i[rstarts[rank]]; 1092 a_jsendbuf = ad->j; 1093 b_jsendbuf = bd->j; 1094 n = A->rmap->rend - A->rmap->rstart; 1095 cnt = 0; 1096 for (i=0; i<n; i++) { 1097 1098 /* put in lower diagonal portion */ 1099 m = bd->i[i+1] - bd->i[i]; 1100 while (m > 0) { 1101 /* is it above diagonal (in bd (compressed) numbering) */ 1102 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1103 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1104 m--; 1105 } 1106 1107 /* put in diagonal portion */ 1108 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1109 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1110 } 1111 1112 /* put in upper diagonal portion */ 1113 while (m-- > 0) { 1114 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1115 } 1116 } 1117 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1118 1119 /*-------------------------------------------------------------------- 1120 Gather all column indices to all processors 1121 */ 1122 for (i=0; i<size; i++) { 1123 recvcounts[i] = 0; 1124 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1125 recvcounts[i] += lens[j]; 1126 } 1127 } 1128 displs[0] = 0; 1129 for (i=1; i<size; i++) { 1130 displs[i] = displs[i-1] + recvcounts[i-1]; 1131 } 1132 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1133 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1134 #else 1135 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1136 #endif 1137 /*-------------------------------------------------------------------- 1138 Assemble the matrix into useable form (note numerical values not yet set) 1139 */ 1140 /* set the b->ilen (length of each row) values */ 1141 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1142 /* set the b->i indices */ 1143 b->i[0] = 0; 1144 for (i=1; i<=A->rmap->N; i++) { 1145 b->i[i] = b->i[i-1] + lens[i-1]; 1146 } 1147 ierr = PetscFree(lens);CHKERRQ(ierr); 1148 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1149 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1150 1151 } else { 1152 B = **Bin; 1153 b = (Mat_SeqAIJ*)B->data; 1154 } 1155 1156 /*-------------------------------------------------------------------- 1157 Copy my part of matrix numerical values into the values location 1158 */ 1159 if (flag == MAT_GET_VALUES) { 1160 sendcount = ad->nz + bd->nz; 1161 sendbuf = b->a + b->i[rstarts[rank]]; 1162 a_sendbuf = ad->a; 1163 b_sendbuf = bd->a; 1164 b_sendj = bd->j; 1165 n = A->rmap->rend - A->rmap->rstart; 1166 cnt = 0; 1167 for (i=0; i<n; i++) { 1168 1169 /* put in lower diagonal portion */ 1170 m = bd->i[i+1] - bd->i[i]; 1171 while (m > 0) { 1172 /* is it above diagonal (in bd (compressed) numbering) */ 1173 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1174 sendbuf[cnt++] = *b_sendbuf++; 1175 m--; 1176 b_sendj++; 1177 } 1178 1179 /* put in diagonal portion */ 1180 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1181 sendbuf[cnt++] = *a_sendbuf++; 1182 } 1183 1184 /* put in upper diagonal portion */ 1185 while (m-- > 0) { 1186 sendbuf[cnt++] = *b_sendbuf++; 1187 b_sendj++; 1188 } 1189 } 1190 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1191 1192 /* ----------------------------------------------------------------- 1193 Gather all numerical values to all processors 1194 */ 1195 if (!recvcounts) { 1196 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1197 } 1198 for (i=0; i<size; i++) { 1199 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1200 } 1201 displs[0] = 0; 1202 for (i=1; i<size; i++) { 1203 displs[i] = displs[i-1] + recvcounts[i-1]; 1204 } 1205 recvbuf = b->a; 1206 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1207 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1208 #else 1209 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1210 #endif 1211 } /* endof (flag == MAT_GET_VALUES) */ 1212 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1213 1214 if (A->symmetric) { 1215 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1216 } else if (A->hermitian) { 1217 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1218 } else if (A->structurally_symmetric) { 1219 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1220 } 1221 PetscFunctionReturn(0); 1222 } 1223 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1224 { 1225 PetscErrorCode ierr; 1226 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1227 Mat_SubMat *submatj = c->submatis1; 1228 PetscInt i; 1229 1230 PetscFunctionBegin; 1231 if (submatj->id) PetscFunctionReturn(0); 1232 1233 ierr = submatj->destroy(C);CHKERRQ(ierr); 1234 1235 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1236 1237 for (i=0; i<submatj->nrqr; ++i) { 1238 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1239 } 1240 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1241 1242 if (submatj->rbuf1) { 1243 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1244 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1245 } 1246 1247 for (i=0; i<submatj->nrqs; ++i) { 1248 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1249 } 1250 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1251 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1252 1253 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 1254 1255 #if defined(PETSC_USE_CTABLE) 1256 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1257 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1258 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1259 #else 1260 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1261 #endif 1262 1263 if (!submatj->allcolumns) { 1264 #if defined(PETSC_USE_CTABLE) 1265 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1266 #else 1267 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1268 #endif 1269 } 1270 1271 ierr = PetscFree(submatj);CHKERRQ(ierr); 1272 PetscFunctionReturn(0); 1273 } 1274 1275 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1276 { 1277 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1278 Mat submat,A = c->A,B = c->B; 1279 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1280 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1281 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1282 const PetscInt *icol,*irow; 1283 PetscInt nrow,ncol,start; 1284 PetscErrorCode ierr; 1285 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1286 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1287 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1288 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1289 PetscInt *lens,rmax,ncols,*cols,Crow; 1290 #if defined(PETSC_USE_CTABLE) 1291 PetscTable cmap,rmap; 1292 PetscInt *cmap_loc,*rmap_loc; 1293 #else 1294 PetscInt *cmap,*rmap; 1295 #endif 1296 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1297 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1298 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1299 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1300 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1301 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1302 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1303 MPI_Comm comm; 1304 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1305 PetscMPIInt *onodes1,*olengths1,idex,end; 1306 Mat_SubMat *smatis1; 1307 PetscBool isrowsorted; 1308 1309 PetscFunctionBegin; 1310 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1311 1312 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1313 size = c->size; 1314 rank = c->rank; 1315 1316 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1317 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1318 1319 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1320 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1321 if (allcolumns) { 1322 icol = NULL; 1323 ncol = C->cmap->N; 1324 } else { 1325 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1326 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1327 } 1328 1329 if (scall == MAT_INITIAL_MATRIX) { 1330 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1331 1332 /* Get some new tags to keep the communication clean */ 1333 tag1 = ((PetscObject)C)->tag; 1334 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1335 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1336 1337 /* evaluate communication - mesg to who, length of mesg, and buffer space 1338 required. Based on this, buffers are allocated, and data copied into them */ 1339 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1340 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1341 1342 /* w1[proc] = num of rows owned by proc -- to be requested */ 1343 proc = 0; 1344 nrqs = 0; /* num of outgoing messages */ 1345 for (j=0; j<nrow; j++) { 1346 row = irow[j]; /* sorted! */ 1347 while (row >= C->rmap->range[proc+1]) proc++; 1348 w1[proc]++; 1349 row2proc[j] = proc; /* map row index to proc */ 1350 1351 if (proc != rank && !w2[proc]) { 1352 w2[proc] = 1; nrqs++; 1353 } 1354 } 1355 w1[rank] = 0; /* rows owned by self will not be requested */ 1356 1357 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1358 for (proc=0,j=0; proc<size; proc++) { 1359 if (w1[proc]) { pa[j++] = proc;} 1360 } 1361 1362 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1363 msz = 0; /* total mesg length (for all procs) */ 1364 for (i=0; i<nrqs; i++) { 1365 proc = pa[i]; 1366 w1[proc] += 3; 1367 msz += w1[proc]; 1368 } 1369 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1370 1371 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1372 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1373 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1374 1375 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1376 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1377 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1378 1379 /* Now post the Irecvs corresponding to these messages */ 1380 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1381 1382 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1383 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1384 1385 /* Allocate Memory for outgoing messages */ 1386 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1387 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1388 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1389 1390 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1391 iptr = tmp; 1392 for (i=0; i<nrqs; i++) { 1393 proc = pa[i]; 1394 sbuf1[proc] = iptr; 1395 iptr += w1[proc]; 1396 } 1397 1398 /* Form the outgoing messages */ 1399 /* Initialize the header space */ 1400 for (i=0; i<nrqs; i++) { 1401 proc = pa[i]; 1402 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1403 ptr[proc] = sbuf1[proc] + 3; 1404 } 1405 1406 /* Parse the isrow and copy data into outbuf */ 1407 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1408 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1409 proc = row2proc[j]; 1410 if (proc != rank) { /* copy to the outgoing buf*/ 1411 *ptr[proc] = irow[j]; 1412 ctr[proc]++; ptr[proc]++; 1413 } 1414 } 1415 1416 /* Update the headers for the current IS */ 1417 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1418 if ((ctr_j = ctr[j])) { 1419 sbuf1_j = sbuf1[j]; 1420 k = ++sbuf1_j[0]; 1421 sbuf1_j[2*k] = ctr_j; 1422 sbuf1_j[2*k-1] = 0; 1423 } 1424 } 1425 1426 /* Now post the sends */ 1427 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1428 for (i=0; i<nrqs; ++i) { 1429 proc = pa[i]; 1430 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1431 } 1432 1433 /* Post Receives to capture the buffer size */ 1434 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1435 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1436 1437 rbuf2[0] = tmp + msz; 1438 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1439 1440 for (i=0; i<nrqs; ++i) { 1441 proc = pa[i]; 1442 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1443 } 1444 1445 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1446 1447 /* Send to other procs the buf size they should allocate */ 1448 /* Receive messages*/ 1449 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1450 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1451 1452 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1453 for (i=0; i<nrqr; ++i) { 1454 req_size[i] = 0; 1455 rbuf1_i = rbuf1[i]; 1456 start = 2*rbuf1_i[0] + 1; 1457 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1458 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1459 sbuf2_i = sbuf2[i]; 1460 for (j=start; j<end; j++) { 1461 k = rbuf1_i[j] - rstart; 1462 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1463 sbuf2_i[j] = ncols; 1464 req_size[i] += ncols; 1465 } 1466 req_source1[i] = r_status1[i].MPI_SOURCE; 1467 1468 /* form the header */ 1469 sbuf2_i[0] = req_size[i]; 1470 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1471 1472 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1473 } 1474 1475 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1476 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1477 1478 /* rbuf2 is received, Post recv column indices a->j */ 1479 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1480 1481 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1482 for (i=0; i<nrqs; ++i) { 1483 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1484 req_source2[i] = r_status2[i].MPI_SOURCE; 1485 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1486 } 1487 1488 /* Wait on sends1 and sends2 */ 1489 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1490 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1491 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1492 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1493 1494 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1495 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1496 1497 /* Now allocate sending buffers for a->j, and send them off */ 1498 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1499 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1500 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1501 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1502 1503 for (i=0; i<nrqr; i++) { /* for each requested message */ 1504 rbuf1_i = rbuf1[i]; 1505 sbuf_aj_i = sbuf_aj[i]; 1506 ct1 = 2*rbuf1_i[0] + 1; 1507 ct2 = 0; 1508 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1509 1510 kmax = rbuf1[i][2]; 1511 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1512 row = rbuf1_i[ct1] - rstart; 1513 nzA = ai[row+1] - ai[row]; 1514 nzB = bi[row+1] - bi[row]; 1515 ncols = nzA + nzB; 1516 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1517 1518 /* load the column indices for this row into cols*/ 1519 cols = sbuf_aj_i + ct2; 1520 1521 lwrite = 0; 1522 for (l=0; l<nzB; l++) { 1523 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1524 } 1525 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1526 for (l=0; l<nzB; l++) { 1527 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1528 } 1529 1530 ct2 += ncols; 1531 } 1532 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1533 } 1534 1535 /* create column map (cmap): global col of C -> local col of submat */ 1536 #if defined(PETSC_USE_CTABLE) 1537 if (!allcolumns) { 1538 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1539 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1540 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1541 if (icol[j] >= cstart && icol[j] <cend) { 1542 cmap_loc[icol[j] - cstart] = j+1; 1543 } else { /* use PetscTable for non-local col indices */ 1544 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1545 } 1546 } 1547 } else { 1548 cmap = NULL; 1549 cmap_loc = NULL; 1550 } 1551 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1552 #else 1553 if (!allcolumns) { 1554 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1555 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1556 } else { 1557 cmap = NULL; 1558 } 1559 #endif 1560 1561 /* Create lens for MatSeqAIJSetPreallocation() */ 1562 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1563 1564 /* Compute lens from local part of C */ 1565 for (j=0; j<nrow; j++) { 1566 row = irow[j]; 1567 proc = row2proc[j]; 1568 if (proc == rank) { 1569 /* diagonal part A = c->A */ 1570 ncols = ai[row-rstart+1] - ai[row-rstart]; 1571 cols = aj + ai[row-rstart]; 1572 if (!allcolumns) { 1573 for (k=0; k<ncols; k++) { 1574 #if defined(PETSC_USE_CTABLE) 1575 tcol = cmap_loc[cols[k]]; 1576 #else 1577 tcol = cmap[cols[k]+cstart]; 1578 #endif 1579 if (tcol) lens[j]++; 1580 } 1581 } else { /* allcolumns */ 1582 lens[j] = ncols; 1583 } 1584 1585 /* off-diagonal part B = c->B */ 1586 ncols = bi[row-rstart+1] - bi[row-rstart]; 1587 cols = bj + bi[row-rstart]; 1588 if (!allcolumns) { 1589 for (k=0; k<ncols; k++) { 1590 #if defined(PETSC_USE_CTABLE) 1591 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1592 #else 1593 tcol = cmap[bmap[cols[k]]]; 1594 #endif 1595 if (tcol) lens[j]++; 1596 } 1597 } else { /* allcolumns */ 1598 lens[j] += ncols; 1599 } 1600 } 1601 } 1602 1603 /* Create row map (rmap): global row of C -> local row of submat */ 1604 #if defined(PETSC_USE_CTABLE) 1605 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1606 for (j=0; j<nrow; j++) { 1607 row = irow[j]; 1608 proc = row2proc[j]; 1609 if (proc == rank) { /* a local row */ 1610 rmap_loc[row - rstart] = j; 1611 } else { 1612 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1613 } 1614 } 1615 #else 1616 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1617 for (j=0; j<nrow; j++) { 1618 rmap[irow[j]] = j; 1619 } 1620 #endif 1621 1622 /* Update lens from offproc data */ 1623 /* recv a->j is done */ 1624 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1625 for (i=0; i<nrqs; i++) { 1626 proc = pa[i]; 1627 sbuf1_i = sbuf1[proc]; 1628 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1629 ct1 = 2 + 1; 1630 ct2 = 0; 1631 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1632 rbuf3_i = rbuf3[i]; /* received C->j */ 1633 1634 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1635 max1 = sbuf1_i[2]; 1636 for (k=0; k<max1; k++,ct1++) { 1637 #if defined(PETSC_USE_CTABLE) 1638 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1639 row--; 1640 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1641 #else 1642 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1643 #endif 1644 /* Now, store row index of submat in sbuf1_i[ct1] */ 1645 sbuf1_i[ct1] = row; 1646 1647 nnz = rbuf2_i[ct1]; 1648 if (!allcolumns) { 1649 for (l=0; l<nnz; l++,ct2++) { 1650 #if defined(PETSC_USE_CTABLE) 1651 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1652 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1653 } else { 1654 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1655 } 1656 #else 1657 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1658 #endif 1659 if (tcol) lens[row]++; 1660 } 1661 } else { /* allcolumns */ 1662 lens[row] += nnz; 1663 } 1664 } 1665 } 1666 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1667 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1668 1669 /* Create the submatrices */ 1670 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1671 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1672 1673 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1674 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1675 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1676 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1677 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1678 1679 /* create struct Mat_SubMat and attached it to submat */ 1680 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1681 subc = (Mat_SeqAIJ*)submat->data; 1682 subc->submatis1 = smatis1; 1683 1684 smatis1->id = 0; 1685 smatis1->nrqs = nrqs; 1686 smatis1->nrqr = nrqr; 1687 smatis1->rbuf1 = rbuf1; 1688 smatis1->rbuf2 = rbuf2; 1689 smatis1->rbuf3 = rbuf3; 1690 smatis1->sbuf2 = sbuf2; 1691 smatis1->req_source2 = req_source2; 1692 1693 smatis1->sbuf1 = sbuf1; 1694 smatis1->ptr = ptr; 1695 smatis1->tmp = tmp; 1696 smatis1->ctr = ctr; 1697 1698 smatis1->pa = pa; 1699 smatis1->req_size = req_size; 1700 smatis1->req_source1 = req_source1; 1701 1702 smatis1->allcolumns = allcolumns; 1703 smatis1->row2proc = row2proc; 1704 smatis1->rmap = rmap; 1705 smatis1->cmap = cmap; 1706 #if defined(PETSC_USE_CTABLE) 1707 smatis1->rmap_loc = rmap_loc; 1708 smatis1->cmap_loc = cmap_loc; 1709 #endif 1710 1711 smatis1->destroy = submat->ops->destroy; 1712 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1713 submat->factortype = C->factortype; 1714 1715 /* compute rmax */ 1716 rmax = 0; 1717 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1718 1719 } else { /* scall == MAT_REUSE_MATRIX */ 1720 submat = submats[0]; 1721 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1722 1723 subc = (Mat_SeqAIJ*)submat->data; 1724 rmax = subc->rmax; 1725 smatis1 = subc->submatis1; 1726 nrqs = smatis1->nrqs; 1727 nrqr = smatis1->nrqr; 1728 rbuf1 = smatis1->rbuf1; 1729 rbuf2 = smatis1->rbuf2; 1730 rbuf3 = smatis1->rbuf3; 1731 req_source2 = smatis1->req_source2; 1732 1733 sbuf1 = smatis1->sbuf1; 1734 sbuf2 = smatis1->sbuf2; 1735 ptr = smatis1->ptr; 1736 tmp = smatis1->tmp; 1737 ctr = smatis1->ctr; 1738 1739 pa = smatis1->pa; 1740 req_size = smatis1->req_size; 1741 req_source1 = smatis1->req_source1; 1742 1743 allcolumns = smatis1->allcolumns; 1744 row2proc = smatis1->row2proc; 1745 rmap = smatis1->rmap; 1746 cmap = smatis1->cmap; 1747 #if defined(PETSC_USE_CTABLE) 1748 rmap_loc = smatis1->rmap_loc; 1749 cmap_loc = smatis1->cmap_loc; 1750 #endif 1751 } 1752 1753 /* Post recv matrix values */ 1754 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1755 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1756 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1757 for (i=0; i<nrqs; ++i) { 1758 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1759 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1760 } 1761 1762 /* Allocate sending buffers for a->a, and send them off */ 1763 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1764 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1765 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1766 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1767 1768 for (i=0; i<nrqr; i++) { 1769 rbuf1_i = rbuf1[i]; 1770 sbuf_aa_i = sbuf_aa[i]; 1771 ct1 = 2*rbuf1_i[0]+1; 1772 ct2 = 0; 1773 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1774 1775 kmax = rbuf1_i[2]; 1776 for (k=0; k<kmax; k++,ct1++) { 1777 row = rbuf1_i[ct1] - rstart; 1778 nzA = ai[row+1] - ai[row]; 1779 nzB = bi[row+1] - bi[row]; 1780 ncols = nzA + nzB; 1781 cworkB = bj + bi[row]; 1782 vworkA = a_a + ai[row]; 1783 vworkB = b_a + bi[row]; 1784 1785 /* load the column values for this row into vals*/ 1786 vals = sbuf_aa_i + ct2; 1787 1788 lwrite = 0; 1789 for (l=0; l<nzB; l++) { 1790 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1791 } 1792 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1793 for (l=0; l<nzB; l++) { 1794 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1795 } 1796 1797 ct2 += ncols; 1798 } 1799 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1800 } 1801 1802 /* Assemble submat */ 1803 /* First assemble the local rows */ 1804 for (j=0; j<nrow; j++) { 1805 row = irow[j]; 1806 proc = row2proc[j]; 1807 if (proc == rank) { 1808 Crow = row - rstart; /* local row index of C */ 1809 #if defined(PETSC_USE_CTABLE) 1810 row = rmap_loc[Crow]; /* row index of submat */ 1811 #else 1812 row = rmap[row]; 1813 #endif 1814 1815 if (allcolumns) { 1816 /* diagonal part A = c->A */ 1817 ncols = ai[Crow+1] - ai[Crow]; 1818 cols = aj + ai[Crow]; 1819 vals = a->a + ai[Crow]; 1820 i = 0; 1821 for (k=0; k<ncols; k++) { 1822 subcols[i] = cols[k] + cstart; 1823 subvals[i++] = vals[k]; 1824 } 1825 1826 /* off-diagonal part B = c->B */ 1827 ncols = bi[Crow+1] - bi[Crow]; 1828 cols = bj + bi[Crow]; 1829 vals = b->a + bi[Crow]; 1830 for (k=0; k<ncols; k++) { 1831 subcols[i] = bmap[cols[k]]; 1832 subvals[i++] = vals[k]; 1833 } 1834 1835 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1836 1837 } else { /* !allcolumns */ 1838 #if defined(PETSC_USE_CTABLE) 1839 /* diagonal part A = c->A */ 1840 ncols = ai[Crow+1] - ai[Crow]; 1841 cols = aj + ai[Crow]; 1842 vals = a->a + ai[Crow]; 1843 i = 0; 1844 for (k=0; k<ncols; k++) { 1845 tcol = cmap_loc[cols[k]]; 1846 if (tcol) { 1847 subcols[i] = --tcol; 1848 subvals[i++] = vals[k]; 1849 } 1850 } 1851 1852 /* off-diagonal part B = c->B */ 1853 ncols = bi[Crow+1] - bi[Crow]; 1854 cols = bj + bi[Crow]; 1855 vals = b->a + bi[Crow]; 1856 for (k=0; k<ncols; k++) { 1857 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1858 if (tcol) { 1859 subcols[i] = --tcol; 1860 subvals[i++] = vals[k]; 1861 } 1862 } 1863 #else 1864 /* diagonal part A = c->A */ 1865 ncols = ai[Crow+1] - ai[Crow]; 1866 cols = aj + ai[Crow]; 1867 vals = a->a + ai[Crow]; 1868 i = 0; 1869 for (k=0; k<ncols; k++) { 1870 tcol = cmap[cols[k]+cstart]; 1871 if (tcol) { 1872 subcols[i] = --tcol; 1873 subvals[i++] = vals[k]; 1874 } 1875 } 1876 1877 /* off-diagonal part B = c->B */ 1878 ncols = bi[Crow+1] - bi[Crow]; 1879 cols = bj + bi[Crow]; 1880 vals = b->a + bi[Crow]; 1881 for (k=0; k<ncols; k++) { 1882 tcol = cmap[bmap[cols[k]]]; 1883 if (tcol) { 1884 subcols[i] = --tcol; 1885 subvals[i++] = vals[k]; 1886 } 1887 } 1888 #endif 1889 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1890 } 1891 } 1892 } 1893 1894 /* Now assemble the off-proc rows */ 1895 for (i=0; i<nrqs; i++) { /* for each requested message */ 1896 /* recv values from other processes */ 1897 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1898 proc = pa[idex]; 1899 sbuf1_i = sbuf1[proc]; 1900 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1901 ct1 = 2 + 1; 1902 ct2 = 0; /* count of received C->j */ 1903 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1904 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1905 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1906 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1907 1908 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1909 max1 = sbuf1_i[2]; /* num of rows */ 1910 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1911 row = sbuf1_i[ct1]; /* row index of submat */ 1912 if (!allcolumns) { 1913 idex = 0; 1914 if (scall == MAT_INITIAL_MATRIX) { 1915 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1916 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1917 #if defined(PETSC_USE_CTABLE) 1918 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1919 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1920 } else { 1921 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1922 } 1923 #else 1924 tcol = cmap[rbuf3_i[ct2]]; 1925 #endif 1926 if (tcol) { 1927 subcols[idex] = --tcol; 1928 subvals[idex++] = rbuf4_i[ct2]; 1929 1930 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1931 For reuse, we replace received C->j with index that should be inserted to submat */ 1932 rbuf3_i[ct3++] = ct2; 1933 } 1934 } 1935 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1936 1937 } else { /* scall == MAT_REUSE_MATRIX */ 1938 submat = submats[0]; 1939 subc = (Mat_SeqAIJ*)submat->data; 1940 1941 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1942 for (l=0; l<nnz; l++) { 1943 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1944 subvals[idex++] = rbuf4_i[ct2]; 1945 } 1946 1947 bj = subc->j + subc->i[row]; 1948 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1949 } 1950 } else { /* allcolumns */ 1951 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1952 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1953 ct2 += nnz; 1954 } 1955 } 1956 } 1957 1958 /* sending a->a are done */ 1959 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1960 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1961 1962 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1963 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1964 submats[0] = submat; 1965 1966 /* Restore the indices */ 1967 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1968 if (!allcolumns) { 1969 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1970 } 1971 1972 /* Destroy allocated memory */ 1973 for (i=0; i<nrqs; ++i) { 1974 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1975 } 1976 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1977 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1978 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1979 1980 if (scall == MAT_INITIAL_MATRIX) { 1981 ierr = PetscFree(lens);CHKERRQ(ierr); 1982 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1983 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1984 } 1985 PetscFunctionReturn(0); 1986 } 1987 1988 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1989 { 1990 PetscErrorCode ierr; 1991 PetscInt ncol; 1992 PetscBool colflag,allcolumns=PETSC_FALSE; 1993 1994 PetscFunctionBegin; 1995 /* Allocate memory to hold all the submatrices */ 1996 if (scall == MAT_INITIAL_MATRIX) { 1997 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 1998 } 1999 2000 /* Check for special case: each processor gets entire matrix columns */ 2001 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2002 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2003 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2004 2005 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2006 PetscFunctionReturn(0); 2007 } 2008 2009 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2010 { 2011 PetscErrorCode ierr; 2012 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2013 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2014 2015 PetscFunctionBegin; 2016 #if 0 2017 /* Check for special case: each processor gets entire matrix */ 2018 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2019 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2020 PetscFunctionReturn(0); 2021 } 2022 #endif 2023 2024 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2025 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2026 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2027 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2028 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2029 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2030 wantallmatrix = PETSC_TRUE; 2031 2032 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2033 } 2034 } 2035 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2036 if (twantallmatrix) { 2037 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2038 PetscFunctionReturn(0); 2039 } 2040 2041 /* Allocate memory to hold all the submatrices */ 2042 if (scall == MAT_INITIAL_MATRIX) { 2043 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2044 } 2045 2046 /* Check for special case: each processor gets entire matrix columns */ 2047 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2048 for (i=0; i<ismax; i++) { 2049 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2050 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2051 if (colflag && ncol == C->cmap->N) { 2052 allcolumns[i] = PETSC_TRUE; 2053 } else { 2054 allcolumns[i] = PETSC_FALSE; 2055 } 2056 } 2057 2058 /* Determine the number of stages through which submatrices are done */ 2059 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2060 2061 /* 2062 Each stage will extract nmax submatrices. 2063 nmax is determined by the matrix column dimension. 2064 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2065 */ 2066 if (!nmax) nmax = 1; 2067 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 2068 2069 /* Make sure every processor loops through the nstages */ 2070 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2071 2072 for (i=0,pos=0; i<nstages; i++) { 2073 if (pos+nmax <= ismax) max_no = nmax; 2074 else if (pos == ismax) max_no = 0; 2075 else max_no = ismax-pos; 2076 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2077 pos += max_no; 2078 } 2079 2080 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2081 PetscFunctionReturn(0); 2082 } 2083 2084 /* -------------------------------------------------------------------------*/ 2085 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2086 { 2087 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2088 Mat A = c->A; 2089 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2090 const PetscInt **icol,**irow; 2091 PetscInt *nrow,*ncol,start; 2092 PetscErrorCode ierr; 2093 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2094 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2095 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2096 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,max2; 2097 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2098 #if defined(PETSC_USE_CTABLE) 2099 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2100 #else 2101 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2102 #endif 2103 const PetscInt *irow_i; 2104 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2105 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2106 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2107 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2108 MPI_Status *r_status3,*r_status4,*s_status4; 2109 MPI_Comm comm; 2110 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 2111 PetscMPIInt *onodes1,*olengths1; 2112 PetscMPIInt idex,idex2,end; 2113 PetscInt **row2proc,*row2proc_i; 2114 Mat_SubMat **smats,*smat_i; 2115 PetscBool *issorted; 2116 2117 PetscFunctionBegin; 2118 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2119 tag0 = ((PetscObject)C)->tag; 2120 size = c->size; 2121 rank = c->rank; 2122 if (ismax == 0) printf("[%d] MatGetSubMatrices_MPIAIJ_Local, scall %d, ismax = 0!!! \n",rank,scall); 2123 //printf("[%d] MatGetSubMatrices_MPIAIJ_Local, scall %d\n",rank,scall); 2124 2125 /* Get some new tags to keep the communication clean */ 2126 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 2127 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2128 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2129 2130 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 2131 ierr = PetscMalloc1(ismax,&issorted);CHKERRQ(ierr); 2132 2133 for (i=0; i<ismax; i++) { 2134 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2135 2136 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2137 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2138 if (allcolumns[i]) { 2139 icol[i] = NULL; 2140 ncol[i] = C->cmap->N; 2141 } else { 2142 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2143 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2144 } 2145 } 2146 2147 2148 ierr = PetscMalloc1(ismax,&smats);CHKERRQ(ierr); 2149 ierr = PetscMalloc1(ismax,&row2proc);CHKERRQ(ierr); 2150 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 2151 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 2152 2153 if (scall == MAT_REUSE_MATRIX) { 2154 //PetscInt *req_source1,*req_source2; 2155 2156 /* Assumes new rows are same length as the old rows,hence bug! */ 2157 for (i=0; i<ismax; i++) { 2158 mat = (Mat_SeqAIJ*)(submats[i]->data); 2159 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2160 2161 /* Initial matrix as if empty -- rm later!!! */ 2162 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2163 2164 /* Initial matrix as if empty */ 2165 submats[i]->factortype = C->factortype; 2166 2167 smat_i = mat->submatis1; 2168 smats[i] = smat_i; 2169 2170 nrqs = smat_i->nrqs; 2171 nrqr = smat_i->nrqr; 2172 rbuf1 = smat_i->rbuf1; 2173 rbuf2 = smat_i->rbuf2; 2174 rbuf3 = smat_i->rbuf3; 2175 req_source2 = smat_i->req_source2; 2176 2177 sbuf1 = smat_i->sbuf1; 2178 sbuf2 = smat_i->sbuf2; 2179 ptr = smat_i->ptr; 2180 tmp = smat_i->tmp; 2181 ctr = smat_i->ctr; 2182 2183 pa = smat_i->pa; 2184 req_size = smat_i->req_size; 2185 req_source1 = smat_i->req_source1; 2186 2187 allcolumns[i] = smat_i->allcolumns; 2188 row2proc[i] = smat_i->row2proc; 2189 rmap[i] = smat_i->rmap; 2190 cmap[i] = smat_i->cmap; 2191 #if defined(PETSC_USE_CTABLE) 2192 //rmap_loc = smat_i->rmap_loc; 2193 //cmap_loc = smat_i->cmap_loc; 2194 #endif 2195 } 2196 } 2197 2198 if (scall == MAT_INITIAL_MATRIX ||!ismax) { 2199 2200 /* evaluate communication - mesg to who, length of mesg, and buffer space 2201 required. Based on this, buffers are allocated, and data copied into them*/ 2202 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 2203 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2204 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2205 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2206 2207 for (i=0; i<ismax; i++) { 2208 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 2209 jmax = nrow[i]; 2210 irow_i = irow[i]; 2211 2212 if (scall == MAT_INITIAL_MATRIX) { 2213 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2214 row2proc[i] = row2proc_i; 2215 } else { 2216 row2proc_i = row2proc[i]; 2217 } 2218 2219 if (issorted[i]) proc = 0; 2220 for (j=0; j<jmax; j++) { 2221 if (!issorted[i]) proc = 0; 2222 row = irow_i[j]; 2223 while (row >= C->rmap->range[proc+1]) proc++; 2224 w4[proc]++; 2225 row2proc_i[j] = proc; /* map row index to proc */ 2226 } 2227 for (j=0; j<size; j++) { 2228 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 2229 } 2230 } 2231 2232 nrqs = 0; /* no of outgoing messages */ 2233 msz = 0; /* total mesg length (for all procs) */ 2234 w1[rank] = 0; /* no mesg sent to self */ 2235 w3[rank] = 0; 2236 for (i=0; i<size; i++) { 2237 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2238 } 2239 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2240 for (i=0,j=0; i<size; i++) { 2241 if (w1[i]) { pa[j] = i; j++; } 2242 } 2243 2244 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2245 for (i=0; i<nrqs; i++) { 2246 j = pa[i]; 2247 w1[j] += w2[j] + 2* w3[j]; 2248 msz += w1[j]; 2249 } 2250 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2251 2252 /* Determine the number of messages to expect, their lengths, from from-ids */ 2253 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2254 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2255 2256 /* Now post the Irecvs corresponding to these messages */ 2257 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2258 2259 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2260 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2261 2262 /* Allocate Memory for outgoing messages */ 2263 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2264 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2265 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2266 2267 { 2268 PetscInt *iptr = tmp; 2269 k = 0; 2270 for (i=0; i<nrqs; i++) { 2271 j = pa[i]; 2272 iptr += k; 2273 sbuf1[j] = iptr; 2274 k = w1[j]; 2275 } 2276 } 2277 2278 /* Form the outgoing messages */ 2279 /* Initialize the header space */ 2280 for (i=0; i<nrqs; i++) { 2281 j = pa[i]; 2282 sbuf1[j][0] = 0; 2283 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2284 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2285 } 2286 2287 /* Parse the isrow and copy data into outbuf */ 2288 for (i=0; i<ismax; i++) { 2289 row2proc_i = row2proc[i]; 2290 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2291 irow_i = irow[i]; 2292 jmax = nrow[i]; 2293 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2294 proc = row2proc_i[j]; 2295 if (proc != rank) { /* copy to the outgoing buf*/ 2296 ctr[proc]++; 2297 *ptr[proc] = irow_i[j]; 2298 ptr[proc]++; 2299 } 2300 } 2301 /* Update the headers for the current IS */ 2302 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2303 if ((ctr_j = ctr[j])) { 2304 sbuf1_j = sbuf1[j]; 2305 k = ++sbuf1_j[0]; 2306 sbuf1_j[2*k] = ctr_j; 2307 sbuf1_j[2*k-1] = i; 2308 } 2309 } 2310 } 2311 2312 /* Now post the sends */ 2313 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2314 for (i=0; i<nrqs; ++i) { 2315 j = pa[i]; 2316 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2317 } 2318 2319 /* Post Receives to capture the buffer size */ 2320 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2321 //ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 2322 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2323 rbuf2[0] = tmp + msz; 2324 for (i=1; i<nrqs; ++i) { 2325 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2326 } 2327 for (i=0; i<nrqs; ++i) { 2328 j = pa[i]; 2329 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 2330 } 2331 2332 /* Send to other procs the buf size they should allocate */ 2333 2334 2335 /* Receive messages*/ 2336 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2337 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2338 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2339 { 2340 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2341 PetscInt *sbuf2_i; 2342 2343 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2344 for (i=0; i<nrqr; ++i) { 2345 //ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 2346 idex = i; 2347 req_size[idex] = 0; 2348 rbuf1_i = rbuf1[idex]; 2349 start = 2*rbuf1_i[0] + 1; 2350 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2351 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 2352 sbuf2_i = sbuf2[idex]; 2353 for (j=start; j<end; j++) { 2354 id = rbuf1_i[j] - rstart; 2355 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2356 sbuf2_i[j] = ncols; 2357 req_size[idex] += ncols; 2358 } 2359 req_source1[idex] = r_status1[i].MPI_SOURCE; 2360 /* form the header */ 2361 sbuf2_i[0] = req_size[idex]; 2362 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2363 2364 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 2365 } 2366 } 2367 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2368 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2369 2370 /* recv buffer sizes */ 2371 /* Receive messages*/ 2372 2373 //ierr = PetscMalloc2(nrqs+1,&req_source2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2374 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2375 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2376 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2377 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2378 2379 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2380 for (i=0; i<nrqs; ++i) { 2381 idex = i; 2382 //ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 2383 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 2384 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 2385 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 2386 req_source2[i] = r_status2[i].MPI_SOURCE; 2387 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,req_source2[i],tag3,comm,r_waits4+idex);CHKERRQ(ierr); 2388 } 2389 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2390 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2391 2392 /* Wait on sends1 and sends2 */ 2393 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2394 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2395 2396 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2397 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2398 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2399 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2400 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2401 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2402 2403 /* Now allocate buffers for a->j, and send them off */ 2404 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2405 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2406 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2407 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2408 2409 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2410 { 2411 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2412 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2413 PetscInt cend = C->cmap->rend; 2414 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2415 2416 for (i=0; i<nrqr; i++) { 2417 rbuf1_i = rbuf1[i]; 2418 sbuf_aj_i = sbuf_aj[i]; 2419 ct1 = 2*rbuf1_i[0] + 1; 2420 ct2 = 0; 2421 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2422 kmax = rbuf1[i][2*j]; 2423 for (k=0; k<kmax; k++,ct1++) { 2424 row = rbuf1_i[ct1] - rstart; 2425 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2426 ncols = nzA + nzB; 2427 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2428 2429 /* load the column indices for this row into cols*/ 2430 cols = sbuf_aj_i + ct2; 2431 2432 lwrite = 0; 2433 for (l=0; l<nzB; l++) { 2434 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2435 } 2436 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2437 for (l=0; l<nzB; l++) { 2438 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2439 } 2440 2441 ct2 += ncols; 2442 } 2443 } 2444 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 2445 } 2446 } 2447 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 2448 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 2449 2450 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2451 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2452 2453 /* create col map: global col of C -> local col of submatrices */ 2454 if (scall == MAT_INITIAL_MATRIX) { 2455 const PetscInt *icol_i; 2456 #if defined(PETSC_USE_CTABLE) 2457 for (i=0; i<ismax; i++) { 2458 if (!allcolumns[i]) { 2459 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2460 2461 jmax = ncol[i]; 2462 icol_i = icol[i]; 2463 cmap_i = cmap[i]; 2464 for (j=0; j<jmax; j++) { 2465 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2466 } 2467 } else cmap[i] = NULL; 2468 } 2469 #else 2470 for (i=0; i<ismax; i++) { 2471 if (!allcolumns[i]) { 2472 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2473 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2474 jmax = ncol[i]; 2475 icol_i = icol[i]; 2476 cmap_i = cmap[i]; 2477 for (j=0; j<jmax; j++) { 2478 cmap_i[icol_i[j]] = j+1; 2479 } 2480 } else cmap[i] = NULL; 2481 } 2482 #endif 2483 } 2484 2485 /* Create lens which is required for MatCreate... */ 2486 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2487 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2488 if (ismax) { 2489 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 2490 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 2491 } 2492 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2493 2494 /* Update lens from local data */ 2495 for (i=0; i<ismax; i++) { 2496 row2proc_i = row2proc[i]; 2497 jmax = nrow[i]; 2498 if (!allcolumns[i]) cmap_i = cmap[i]; 2499 irow_i = irow[i]; 2500 lens_i = lens[i]; 2501 for (j=0; j<jmax; j++) { 2502 row = irow_i[j]; 2503 proc = row2proc_i[j]; 2504 if (proc == rank) { 2505 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2506 if (!allcolumns[i]) { 2507 for (k=0; k<ncols; k++) { 2508 #if defined(PETSC_USE_CTABLE) 2509 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2510 #else 2511 tcol = cmap_i[cols[k]]; 2512 #endif 2513 if (tcol) lens_i[j]++; 2514 } 2515 } else { /* allcolumns */ 2516 lens_i[j] = ncols; 2517 } 2518 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2519 } 2520 } 2521 } 2522 2523 /* Create row map: global row of C -> local row of submatrices */ 2524 if (scall == MAT_INITIAL_MATRIX) { 2525 #if defined(PETSC_USE_CTABLE) 2526 for (i=0; i<ismax; i++) { 2527 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2528 irow_i = irow[i]; 2529 jmax = nrow[i]; 2530 for (j=0; j<jmax; j++) { 2531 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2532 } 2533 } 2534 #else 2535 if (ismax) { 2536 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 2537 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 2538 } 2539 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 2540 for (i=0; i<ismax; i++) { 2541 rmap_i = rmap[i]; 2542 irow_i = irow[i]; 2543 jmax = nrow[i]; 2544 for (j=0; j<jmax; j++) { 2545 rmap_i[irow_i[j]] = j; 2546 } 2547 } 2548 #endif 2549 } 2550 2551 /* Update lens from offproc data */ 2552 { 2553 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2554 2555 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2556 for (tmp2=0; tmp2<nrqs; tmp2++) { 2557 //ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 2558 idex2 = tmp2; 2559 idex = pa[idex2]; 2560 sbuf1_i = sbuf1[idex]; 2561 jmax = sbuf1_i[0]; 2562 ct1 = 2*jmax+1; 2563 ct2 = 0; 2564 rbuf2_i = rbuf2[idex2]; 2565 rbuf3_i = rbuf3[idex2]; 2566 for (j=1; j<=jmax; j++) { 2567 is_no = sbuf1_i[2*j-1]; 2568 max1 = sbuf1_i[2*j]; 2569 lens_i = lens[is_no]; 2570 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2571 rmap_i = rmap[is_no]; 2572 for (k=0; k<max1; k++,ct1++) { 2573 #if defined(PETSC_USE_CTABLE) 2574 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2575 row--; 2576 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2577 #else 2578 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2579 #endif 2580 max2 = rbuf2_i[ct1]; 2581 for (l=0; l<max2; l++,ct2++) { 2582 if (!allcolumns[is_no]) { 2583 #if defined(PETSC_USE_CTABLE) 2584 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2585 #else 2586 tcol = cmap_i[rbuf3_i[ct2]]; 2587 #endif 2588 if (tcol) lens_i[row]++; 2589 } else { /* allcolumns */ 2590 lens_i[row]++; /* lens_i[row] += max2 ? */ 2591 } 2592 } 2593 } 2594 } 2595 } 2596 } 2597 ierr = PetscFree(r_status3);CHKERRQ(ierr); 2598 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2599 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2600 ierr = PetscFree(s_status3);CHKERRQ(ierr); 2601 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2602 2603 } // endof scall == MAT_INITIAL_MATRIX ||!ismax 2604 2605 /* Create the submatrices */ 2606 if (scall == MAT_INITIAL_MATRIX) { 2607 for (i=0; i<ismax; i++) { 2608 PetscInt rbs,cbs; 2609 Mat_SeqAIJ* subc; 2610 2611 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2612 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2613 2614 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2615 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2616 2617 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2618 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2619 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2620 2621 /* create struct Mat_SubMat and attached it to submat */ 2622 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2623 subc = (Mat_SeqAIJ*)submats[i]->data; 2624 subc->submatis1 = smat_i; 2625 smats[i] = smat_i; 2626 2627 smat_i->destroy = submats[i]->ops->destroy; 2628 submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 2629 submats[i]->factortype = C->factortype; 2630 2631 smat_i->id = i; 2632 smat_i->nrqs = nrqs; 2633 smat_i->nrqr = nrqr; 2634 smat_i->rbuf1 = rbuf1; 2635 smat_i->rbuf2 = rbuf2; 2636 smat_i->rbuf3 = rbuf3; 2637 smat_i->sbuf2 = sbuf2; 2638 smat_i->req_source2 = req_source2; 2639 2640 smat_i->sbuf1 = sbuf1; 2641 smat_i->ptr = ptr; 2642 smat_i->tmp = tmp; 2643 smat_i->ctr = ctr; 2644 2645 smat_i->pa = pa; 2646 smat_i->req_size = req_size; 2647 smat_i->req_source1 = req_source1; 2648 2649 smat_i->allcolumns = allcolumns[i]; 2650 smat_i->row2proc = row2proc[i]; 2651 smat_i->rmap = rmap[i]; 2652 smat_i->cmap = cmap[i]; 2653 #if defined(PETSC_USE_CTABLE) 2654 smat_i->rmap_loc = NULL;//rmap_loc; 2655 smat_i->cmap_loc = NULL;//cmap_loc; 2656 #endif 2657 2658 } 2659 } // endof scall == MAT_INITIAL_MATRIX 2660 2661 /* Allocate buffers for a->a, and send them off */ 2662 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2663 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2664 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2665 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2666 2667 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2668 { 2669 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2670 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2671 PetscInt cend = C->cmap->rend; 2672 PetscInt *b_j = b->j; 2673 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2674 2675 for (i=0; i<nrqr; i++) { 2676 rbuf1_i = rbuf1[i]; 2677 sbuf_aa_i = sbuf_aa[i]; 2678 ct1 = 2*rbuf1_i[0]+1; 2679 ct2 = 0; 2680 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2681 kmax = rbuf1_i[2*j]; 2682 for (k=0; k<kmax; k++,ct1++) { 2683 row = rbuf1_i[ct1] - rstart; 2684 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2685 ncols = nzA + nzB; 2686 cworkB = b_j + b_i[row]; 2687 vworkA = a_a + a_i[row]; 2688 vworkB = b_a + b_i[row]; 2689 2690 /* load the column values for this row into vals*/ 2691 vals = sbuf_aa_i+ct2; 2692 2693 lwrite = 0; 2694 for (l=0; l<nzB; l++) { 2695 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2696 } 2697 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2698 for (l=0; l<nzB; l++) { 2699 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2700 } 2701 2702 ct2 += ncols; 2703 } 2704 } 2705 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 2706 } 2707 } 2708 2709 if (!ismax) { 2710 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 2711 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 2712 } 2713 2714 2715 /* Assemble the matrices */ 2716 /* First assemble the local rows */ 2717 { 2718 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2719 PetscScalar *imat_a; 2720 2721 for (i=0; i<ismax; i++) { 2722 row2proc_i = row2proc[i]; 2723 mat = (Mat_SeqAIJ*)submats[i]->data; 2724 imat_ilen = mat->ilen; 2725 imat_j = mat->j; 2726 imat_i = mat->i; 2727 imat_a = mat->a; 2728 2729 if (!allcolumns[i]) cmap_i = cmap[i]; 2730 rmap_i = rmap[i]; 2731 irow_i = irow[i]; 2732 jmax = nrow[i]; 2733 for (j=0; j<jmax; j++) { 2734 row = irow_i[j]; 2735 proc = row2proc_i[j]; 2736 if (proc == rank) { 2737 old_row = row; 2738 #if defined(PETSC_USE_CTABLE) 2739 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2740 row--; 2741 #else 2742 row = rmap_i[row]; 2743 #endif 2744 ilen_row = imat_ilen[row]; 2745 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2746 mat_i = imat_i[row]; 2747 mat_a = imat_a + mat_i; 2748 mat_j = imat_j + mat_i; 2749 if (!allcolumns[i]) { 2750 for (k=0; k<ncols; k++) { 2751 #if defined(PETSC_USE_CTABLE) 2752 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2753 #else 2754 tcol = cmap_i[cols[k]]; 2755 #endif 2756 if (tcol) { 2757 *mat_j++ = tcol - 1; 2758 *mat_a++ = vals[k]; 2759 ilen_row++; 2760 } 2761 } 2762 } else { /* allcolumns */ 2763 for (k=0; k<ncols; k++) { 2764 *mat_j++ = cols[k]; /* global col index! */ 2765 *mat_a++ = vals[k]; 2766 ilen_row++; 2767 } 2768 } 2769 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2770 2771 imat_ilen[row] = ilen_row; 2772 } 2773 } 2774 } 2775 } 2776 2777 /* Now assemble the off proc rows*/ 2778 { 2779 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 2780 PetscInt *imat_j,*imat_i; 2781 PetscScalar *imat_a,*rbuf4_i; 2782 2783 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2784 for (tmp2=0; tmp2<nrqs; tmp2++) { 2785 //ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 2786 idex2 = tmp2; 2787 idex = pa[idex2]; 2788 sbuf1_i = sbuf1[idex]; 2789 jmax = sbuf1_i[0]; 2790 ct1 = 2*jmax + 1; 2791 ct2 = 0; 2792 rbuf2_i = rbuf2[idex2]; 2793 rbuf3_i = rbuf3[idex2]; 2794 rbuf4_i = rbuf4[idex2]; 2795 for (j=1; j<=jmax; j++) { 2796 is_no = sbuf1_i[2*j-1]; 2797 rmap_i = rmap[is_no]; 2798 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2799 mat = (Mat_SeqAIJ*)submats[is_no]->data; 2800 imat_ilen = mat->ilen; 2801 imat_j = mat->j; 2802 imat_i = mat->i; 2803 imat_a = mat->a; 2804 max1 = sbuf1_i[2*j]; 2805 for (k=0; k<max1; k++,ct1++) { 2806 row = sbuf1_i[ct1]; 2807 #if defined(PETSC_USE_CTABLE) 2808 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2809 row--; 2810 #else 2811 row = rmap_i[row]; 2812 #endif 2813 ilen = imat_ilen[row]; 2814 mat_i = imat_i[row]; 2815 mat_a = imat_a + mat_i; 2816 mat_j = imat_j + mat_i; 2817 max2 = rbuf2_i[ct1]; 2818 if (!allcolumns[is_no]) { 2819 for (l=0; l<max2; l++,ct2++) { 2820 2821 #if defined(PETSC_USE_CTABLE) 2822 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2823 #else 2824 tcol = cmap_i[rbuf3_i[ct2]]; 2825 #endif 2826 if (tcol) { 2827 *mat_j++ = tcol - 1; 2828 *mat_a++ = rbuf4_i[ct2]; 2829 ilen++; 2830 } 2831 } 2832 } else { /* allcolumns */ 2833 for (l=0; l<max2; l++,ct2++) { 2834 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2835 *mat_a++ = rbuf4_i[ct2]; 2836 ilen++; 2837 } 2838 } 2839 imat_ilen[row] = ilen; 2840 } 2841 } 2842 } 2843 } 2844 2845 /* sort the rows */ 2846 { 2847 PetscInt *imat_ilen,*imat_j,*imat_i; 2848 PetscScalar *imat_a; 2849 2850 for (i=0; i<ismax; i++) { 2851 mat = (Mat_SeqAIJ*)submats[i]->data; 2852 imat_j = mat->j; 2853 imat_i = mat->i; 2854 imat_a = mat->a; 2855 imat_ilen = mat->ilen; 2856 2857 if (allcolumns[i]) continue; 2858 jmax = nrow[i]; 2859 for (j=0; j<jmax; j++) { 2860 PetscInt ilen; 2861 2862 mat_i = imat_i[j]; 2863 mat_a = imat_a + mat_i; 2864 mat_j = imat_j + mat_i; 2865 ilen = imat_ilen[j]; 2866 ierr = PetscSortIntWithScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 2867 } 2868 } 2869 } 2870 2871 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2872 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2873 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2874 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2875 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2876 2877 /* Restore the indices */ 2878 for (i=0; i<ismax; i++) { 2879 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2880 if (!allcolumns[i]) { 2881 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2882 } 2883 } 2884 2885 /* Destroy allocated memory */ 2886 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 2887 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2888 2889 if (!ismax) { 2890 ierr = PetscFree(pa);CHKERRQ(ierr); 2891 2892 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 2893 for (i=0; i<nrqr; ++i) { 2894 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 2895 } 2896 for (i=0; i<nrqs; ++i) { 2897 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 2898 } 2899 2900 ierr = PetscFree3(sbuf2,req_size,req_source1);CHKERRQ(ierr); 2901 ierr = PetscFree3(req_source2,rbuf2,rbuf3);CHKERRQ(ierr); 2902 } 2903 2904 for (i=0; i<nrqs; ++i) { 2905 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2906 } 2907 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2908 2909 if (scall == MAT_INITIAL_MATRIX) { 2910 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2911 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2912 } 2913 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2914 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2915 2916 ierr = PetscFree(rmap);CHKERRQ(ierr); 2917 ierr = PetscFree(cmap);CHKERRQ(ierr); 2918 2919 if (scall == MAT_INITIAL_MATRIX) { 2920 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2921 ierr = PetscFree(lens);CHKERRQ(ierr); 2922 } 2923 2924 for (i=0; i<ismax; i++) { 2925 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2926 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2927 } 2928 2929 ierr = PetscFree(issorted);CHKERRQ(ierr); 2930 ierr = PetscFree(row2proc);CHKERRQ(ierr); 2931 ierr = PetscFree(smats);CHKERRQ(ierr); 2932 PetscFunctionReturn(0); 2933 } 2934 2935 /* 2936 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2937 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2938 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2939 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2940 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2941 state, and needs to be "assembled" later by compressing B's column space. 2942 2943 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2944 Following this call, C->A & C->B have been created, even if empty. 2945 */ 2946 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2947 { 2948 /* If making this function public, change the error returned in this function away from _PLIB. */ 2949 PetscErrorCode ierr; 2950 Mat_MPIAIJ *aij; 2951 Mat_SeqAIJ *Baij; 2952 PetscBool seqaij,Bdisassembled; 2953 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2954 PetscScalar v; 2955 const PetscInt *rowindices,*colindices; 2956 2957 PetscFunctionBegin; 2958 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2959 if (A) { 2960 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2961 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2962 if (rowemb) { 2963 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2964 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2965 } else { 2966 if (C->rmap->n != A->rmap->n) { 2967 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2968 } 2969 } 2970 if (dcolemb) { 2971 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2972 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2973 } else { 2974 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2975 } 2976 } 2977 if (B) { 2978 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2979 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2980 if (rowemb) { 2981 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2982 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2983 } else { 2984 if (C->rmap->n != B->rmap->n) { 2985 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2986 } 2987 } 2988 if (ocolemb) { 2989 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2990 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2991 } else { 2992 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2993 } 2994 } 2995 2996 aij = (Mat_MPIAIJ*)(C->data); 2997 if (!aij->A) { 2998 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2999 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 3000 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 3001 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 3002 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 3003 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 3004 } 3005 if (A) { 3006 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 3007 } else { 3008 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 3009 } 3010 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 3011 /* 3012 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 3013 need to "disassemble" B -- convert it to using C's global indices. 3014 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 3015 3016 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 3017 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 3018 3019 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 3020 At least avoid calling MatSetValues() and the implied searches? 3021 */ 3022 3023 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 3024 #if defined(PETSC_USE_CTABLE) 3025 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 3026 #else 3027 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 3028 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 3029 if (aij->B) { 3030 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 3031 } 3032 #endif 3033 ngcol = 0; 3034 if (aij->lvec) { 3035 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 3036 } 3037 if (aij->garray) { 3038 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 3039 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 3040 } 3041 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 3042 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 3043 } 3044 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 3045 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 3046 } 3047 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3048 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3049 } 3050 } 3051 Bdisassembled = PETSC_FALSE; 3052 if (!aij->B) { 3053 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3054 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3055 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3056 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3057 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3058 Bdisassembled = PETSC_TRUE; 3059 } 3060 if (B) { 3061 Baij = (Mat_SeqAIJ*)(B->data); 3062 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3063 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3064 for (i=0; i<B->rmap->n; i++) { 3065 nz[i] = Baij->i[i+1] - Baij->i[i]; 3066 } 3067 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3068 ierr = PetscFree(nz);CHKERRQ(ierr); 3069 } 3070 3071 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3072 shift = rend-rstart; 3073 count = 0; 3074 rowindices = NULL; 3075 colindices = NULL; 3076 if (rowemb) { 3077 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3078 } 3079 if (ocolemb) { 3080 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3081 } 3082 for (i=0; i<B->rmap->n; i++) { 3083 PetscInt row; 3084 row = i; 3085 if (rowindices) row = rowindices[i]; 3086 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3087 col = Baij->j[count]; 3088 if (colindices) col = colindices[col]; 3089 if (Bdisassembled && col>=rstart) col += shift; 3090 v = Baij->a[count]; 3091 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3092 ++count; 3093 } 3094 } 3095 /* No assembly for aij->B is necessary. */ 3096 /* FIXME: set aij->B's nonzerostate correctly. */ 3097 } else { 3098 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3099 } 3100 C->preallocated = PETSC_TRUE; 3101 C->was_assembled = PETSC_FALSE; 3102 C->assembled = PETSC_FALSE; 3103 /* 3104 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3105 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3106 */ 3107 PetscFunctionReturn(0); 3108 } 3109 3110 /* 3111 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3112 */ 3113 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3114 { 3115 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3116 3117 PetscFunctionBegin; 3118 PetscValidPointer(A,2); 3119 PetscValidPointer(B,3); 3120 /* FIXME: make sure C is assembled */ 3121 *A = aij->A; 3122 *B = aij->B; 3123 /* Note that we don't incref *A and *B, so be careful! */ 3124 PetscFunctionReturn(0); 3125 } 3126 3127 /* 3128 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3129 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3130 */ 3131 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3132 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3133 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3134 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3135 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3136 { 3137 PetscErrorCode ierr; 3138 PetscMPIInt isize,flag; 3139 PetscInt i,ii,cismax,ispar; 3140 Mat *A,*B; 3141 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3142 3143 PetscFunctionBegin; 3144 if (!ismax) PetscFunctionReturn(0); 3145 3146 for (i = 0, cismax = 0; i < ismax; ++i) { 3147 PetscMPIInt isize; 3148 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3149 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3150 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3151 if (isize > 1) ++cismax; 3152 } 3153 3154 /* 3155 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3156 ispar counts the number of parallel ISs across C's comm. 3157 */ 3158 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3159 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3160 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3161 PetscFunctionReturn(0); 3162 } 3163 3164 /* if (ispar) */ 3165 /* 3166 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3167 These are used to extract the off-diag portion of the resulting parallel matrix. 3168 The row IS for the off-diag portion is the same as for the diag portion, 3169 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3170 */ 3171 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3172 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3173 for (i = 0, ii = 0; i < ismax; ++i) { 3174 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3175 if (isize > 1) { 3176 /* 3177 TODO: This is the part that's ***NOT SCALABLE***. 3178 To fix this we need to extract just the indices of C's nonzero columns 3179 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3180 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3181 to be done without serializing on the IS list, so, most likely, it is best 3182 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3183 */ 3184 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3185 /* Now we have to 3186 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3187 were sorted on each rank, concatenated they might no longer be sorted; 3188 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3189 indices in the nondecreasing order to the original index positions. 3190 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3191 */ 3192 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3193 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3194 ++ii; 3195 } 3196 } 3197 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3198 for (i = 0, ii = 0; i < ismax; ++i) { 3199 PetscInt j,issize; 3200 const PetscInt *indices; 3201 3202 /* 3203 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3204 */ 3205 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3206 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3207 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3208 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3209 for (j = 1; j < issize; ++j) { 3210 if (indices[j] == indices[j-1]) { 3211 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3212 } 3213 } 3214 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3215 3216 3217 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3218 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3219 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3220 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3221 for (j = 1; j < issize; ++j) { 3222 if (indices[j-1] == indices[j]) { 3223 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3224 } 3225 } 3226 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3227 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3228 if (isize > 1) { 3229 cisrow[ii] = isrow[i]; 3230 ++ii; 3231 } 3232 } 3233 /* 3234 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3235 array of sequential matrices underlying the resulting parallel matrices. 3236 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3237 contain duplicates. 3238 3239 There are as many diag matrices as there are original index sets. There are only as many parallel 3240 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3241 3242 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3243 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3244 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3245 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3246 with A[i] and B[ii] extracted from the corresponding MPI submat. 3247 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3248 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3249 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3250 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3251 values into A[i] and B[ii] sitting inside the corresponding submat. 3252 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3253 A[i], B[ii], AA[i] or BB[ii] matrices. 3254 */ 3255 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3256 if (scall == MAT_INITIAL_MATRIX) { 3257 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3258 } 3259 3260 /* Now obtain the sequential A and B submatrices separately. */ 3261 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3262 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3263 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3264 3265 /* 3266 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3267 matrices A & B have been extracted directly into the parallel matrices containing them, or 3268 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3269 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3270 to have the same sparsity pattern. 3271 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3272 must be constructed for C. This is done by setseqmat(s). 3273 */ 3274 for (i = 0, ii = 0; i < ismax; ++i) { 3275 /* 3276 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3277 That way we can avoid sorting and computing permutations when reusing. 3278 To this end: 3279 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3280 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3281 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3282 */ 3283 MatStructure pattern; 3284 pattern = DIFFERENT_NONZERO_PATTERN; 3285 3286 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3287 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3288 if (isize > 1) { 3289 if (scall == MAT_INITIAL_MATRIX) { 3290 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3291 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3292 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3293 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3294 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3295 } 3296 /* 3297 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3298 */ 3299 { 3300 Mat AA,BB; 3301 AA = A[i]; 3302 BB = B[ii]; 3303 if (AA || BB) { 3304 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3305 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3306 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3307 } 3308 3309 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3310 ierr = MatDestroy(&BB);CHKERRQ(ierr); 3311 } 3312 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3313 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3314 ++ii; 3315 } else { /* if (isize == 1) */ 3316 if (scall == MAT_REUSE_MATRIX) { 3317 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3318 } 3319 if (isrow_p[i] || iscol_p[i]) { 3320 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3321 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3322 /* Otherwise A is extracted straight into (*submats)[i]. */ 3323 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3324 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3325 } else (*submat)[i] = A[i]; 3326 } 3327 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3328 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3329 } 3330 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3331 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3332 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3333 ierr = PetscFree(A);CHKERRQ(ierr); 3334 ierr = PetscFree(B);CHKERRQ(ierr); 3335 PetscFunctionReturn(0); 3336 } 3337 3338 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3339 { 3340 PetscErrorCode ierr; 3341 3342 PetscFunctionBegin; 3343 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3344 PetscFunctionReturn(0); 3345 } 3346