/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Entry point for the (non-scalable) overlap
   computation: grows each index set in is[] by ov layers of matrix
   connectivity, one layer per call to MatIncreaseOverlap_MPIAIJ_Once.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as
   MatIncreaseOverlap_MPIAIJ, but each layer is computed with the
   memory-scalable (PetscSF-based) algorithm below.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - Add one layer of overlap to the
   nidx index sets in is[] without ever storing global-size work arrays.

   The algorithm: ship each locally-listed row that lives on another rank to
   its owner (two-sided sizes via PetscCommBuildTwoSided, data via a PetscSF
   broadcast), let the owners expand those rows (Send_Scalable), expand the
   locally-owned rows concurrently (Local_Scalable), then merge the returned
   column lists into the index sets (Receive_Scalable).

   The IS objects in is[] are destroyed and recreated with the enlarged sets.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  /* SF takes ownership of "remote" (PETSC_OWN_POINTER): do not free it here */
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* second exchange: reply sizes travel in the reverse direction */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge the overlap columns
   returned by remote owners into the local index sets.

   recvdata is a stream of <is id, count, count columns> triples of total
   length nrecvs.  Each IS in is[] is destroyed and recreated with the union
   of its old indices and the received columns, sorted with duplicates removed.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of max_lsize+nrecvs entries: enough for its old
     indices plus, in the worst case, every received value */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For each row requested by a
   remote rank, collect the column indices of that (locally owned) row from
   both the diagonal (Amat) and off-diagonal (Bmat) blocks, and pack reply
   buffers of triples <is id, number of columns, columns>.

   On output *sbrowsizes holds per-destination <size, offset> pairs and
   *sbrows the packed data; both are allocated here and ownership passes to
   the caller (freed in Once_Scalable).
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count, so that the reply buffer can be sized exactly */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, this time packing the data */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each index set by the
   columns of its locally-owned rows (diagonal block Amat plus off-diagonal
   block Bmat, mapped to global columns via gcols).  Rows owned by other
   ranks are skipped; they are handled through the send/receive path.

   Each IS in is[] is destroyed and recreated sorted and duplicate-free.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements
  corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
/*
   MatIncreaseOverlap_MPIAIJ_Once - Add one layer of overlap to the imax
   index sets in is[] using raw MPI point-to-point messages.

   Phase 1: each rank ships the rows it references but does not own to their
   owners (messages in the format documented above).  Local rows are expanded
   meanwhile via MatIncreaseOverlap_MPIAIJ_Local, using either a PetscBT
   bitmap plus dense per-IS arrays or, with PETSC_USE_CTABLE, hash tables to
   keep memory proportional to the data.
   Phase 2: owners expand the requested rows (MatIncreaseOverlap_MPIAIJ_Receive)
   and send the results back; replies are merged and each IS is recreated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer "tmp" into one segment per destination */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size one scratch array by the largest table so it can be reused below */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      /* table stores <row+1, position+1>; undo the +1 shifts while unpacking */
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for every row already in the set, add the columns of that row; the
       bitmap table_i guarantees each column is added at most once */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
      rbuf   - an array of pointers to the received requests

   Output:
      xdata  - array of messages to be sent back
      isz1   - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total number of requested rows (ct) and total request payload (total_sz) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* estimate the reply size from the average number of nonzeros per local row;
     the buffer is grown by 1.5x below whenever the estimate proves too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single shared buffer and re-base all per-message pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each accepted row by the columns of its A (diagonal) and
         B (off-diagonal) parts, deduplicated through xtable */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
   Every processor gets the entire matrix
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall ==
MAT_INITIAL_MATRIX) { 1057 /* ---------------------------------------------------------------- 1058 Tell every processor the number of nonzeros per row 1059 */ 1060 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1061 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1062 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1063 } 1064 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1065 for (i=0; i<size; i++) { 1066 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1067 displs[i] = A->rmap->range[i]; 1068 } 1069 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1070 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1071 #else 1072 sendcount = A->rmap->rend - A->rmap->rstart; 1073 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1074 #endif 1075 /* --------------------------------------------------------------- 1076 Create the sequential matrix of the same type as the local block diagonal 1077 */ 1078 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1079 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1080 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1081 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1082 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1083 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1084 **Bin = B; 1085 b = (Mat_SeqAIJ*)B->data; 1086 1087 /*-------------------------------------------------------------------- 1088 Copy my part of matrix column indices over 1089 */ 1090 sendcount = ad->nz + bd->nz; 1091 jsendbuf = b->j + b->i[rstarts[rank]]; 1092 a_jsendbuf = ad->j; 1093 b_jsendbuf = bd->j; 1094 n = A->rmap->rend - A->rmap->rstart; 1095 cnt = 0; 1096 for (i=0; i<n; i++) { 1097 1098 /* put in lower diagonal 
portion */ 1099 m = bd->i[i+1] - bd->i[i]; 1100 while (m > 0) { 1101 /* is it above diagonal (in bd (compressed) numbering) */ 1102 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1103 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1104 m--; 1105 } 1106 1107 /* put in diagonal portion */ 1108 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1109 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1110 } 1111 1112 /* put in upper diagonal portion */ 1113 while (m-- > 0) { 1114 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1115 } 1116 } 1117 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1118 1119 /*-------------------------------------------------------------------- 1120 Gather all column indices to all processors 1121 */ 1122 for (i=0; i<size; i++) { 1123 recvcounts[i] = 0; 1124 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1125 recvcounts[i] += lens[j]; 1126 } 1127 } 1128 displs[0] = 0; 1129 for (i=1; i<size; i++) { 1130 displs[i] = displs[i-1] + recvcounts[i-1]; 1131 } 1132 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1133 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1134 #else 1135 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1136 #endif 1137 /*-------------------------------------------------------------------- 1138 Assemble the matrix into useable form (note numerical values not yet set) 1139 */ 1140 /* set the b->ilen (length of each row) values */ 1141 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1142 /* set the b->i indices */ 1143 b->i[0] = 0; 1144 for (i=1; i<=A->rmap->N; i++) { 1145 b->i[i] = b->i[i-1] + lens[i-1]; 1146 } 1147 ierr = PetscFree(lens);CHKERRQ(ierr); 1148 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1149 ierr = 
MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1150 1151 } else { 1152 B = **Bin; 1153 b = (Mat_SeqAIJ*)B->data; 1154 } 1155 1156 /*-------------------------------------------------------------------- 1157 Copy my part of matrix numerical values into the values location 1158 */ 1159 if (flag == MAT_GET_VALUES) { 1160 sendcount = ad->nz + bd->nz; 1161 sendbuf = b->a + b->i[rstarts[rank]]; 1162 a_sendbuf = ad->a; 1163 b_sendbuf = bd->a; 1164 b_sendj = bd->j; 1165 n = A->rmap->rend - A->rmap->rstart; 1166 cnt = 0; 1167 for (i=0; i<n; i++) { 1168 1169 /* put in lower diagonal portion */ 1170 m = bd->i[i+1] - bd->i[i]; 1171 while (m > 0) { 1172 /* is it above diagonal (in bd (compressed) numbering) */ 1173 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1174 sendbuf[cnt++] = *b_sendbuf++; 1175 m--; 1176 b_sendj++; 1177 } 1178 1179 /* put in diagonal portion */ 1180 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1181 sendbuf[cnt++] = *a_sendbuf++; 1182 } 1183 1184 /* put in upper diagonal portion */ 1185 while (m-- > 0) { 1186 sendbuf[cnt++] = *b_sendbuf++; 1187 b_sendj++; 1188 } 1189 } 1190 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1191 1192 /* ----------------------------------------------------------------- 1193 Gather all numerical values to all processors 1194 */ 1195 if (!recvcounts) { 1196 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1197 } 1198 for (i=0; i<size; i++) { 1199 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1200 } 1201 displs[0] = 0; 1202 for (i=1; i<size; i++) { 1203 displs[i] = displs[i-1] + recvcounts[i-1]; 1204 } 1205 recvbuf = b->a; 1206 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1207 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1208 #else 1209 ierr = 
MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1210 #endif 1211 } /* endof (flag == MAT_GET_VALUES) */ 1212 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1213 1214 if (A->symmetric) { 1215 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1216 } else if (A->hermitian) { 1217 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1218 } else if (A->structurally_symmetric) { 1219 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1220 } 1221 PetscFunctionReturn(0); 1222 } 1223 1224 PetscErrorCode MatDestroy_Dummy_MatGetSubmatrices(Mat C) 1225 { 1226 PetscErrorCode ierr; 1227 Mat_SubMat *submatj = (Mat_SubMat*)C->spptr; 1228 PetscInt i; 1229 1230 PetscFunctionBegin; 1231 printf("MatDestroy_Dummy_MatGetSubmatrices...\n"); 1232 1233 if (!submatj->id) { /* delete data that are linked only to submats[id=0] */ 1234 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1235 1236 for (i=0; i<submatj->nrqr; ++i) { 1237 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1238 } 1239 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1240 1241 if (submatj->rbuf1) { 1242 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1243 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1244 } 1245 1246 for (i=0; i<submatj->nrqs; ++i) { 1247 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1248 } 1249 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1250 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1251 } 1252 1253 #if defined(PETSC_USE_CTABLE) 1254 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1255 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1256 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1257 #else 1258 ierr = PetscFree(submatj->rmap);CHKERRQ(ierr); 1259 #endif 1260 1261 if 
(!submatj->allcolumns) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
#else
    ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
#endif
  }
  /* chain to the destroy routine that was in place before we attached ours */
  ierr = submatj->destroy(C);CHKERRQ(ierr);
  ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);

  ierr = PetscFree(submatj);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatDestroy_MPIAIJ_MatGetSubmatrices - destroy routine installed on
  submatrices created by MatGetSubMatrices_MPIAIJ_SingleIS_Local; frees the
  communication data stashed in subc->submatis1 and then chains to the
  previously installed destroy routine.

  NOTE(review): this is nearly identical to MatDestroy_Dummy_MatGetSubmatrices
  above, except submatj comes from ((Mat_SeqAIJ*)C->data)->submatis1 instead
  of C->spptr; the two could share a static helper.
*/
PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *c = (Mat_SeqAIJ*)C->data;
  Mat_SubMat     *submatj = c->submatis1;
  PetscInt       i;

  PetscFunctionBegin;
  if (!submatj->id) { /* delete data that are linked only to submats[id=0] */
    ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr);

    for (i=0; i<submatj->nrqr; ++i) {
      ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr);
    }
    ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr);

    if (submatj->rbuf1) {
      ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr);
      ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr);
    }

    for (i=0; i<submatj->nrqs; ++i) {
      ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr);
    }
    ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr);
    ierr = PetscFree(submatj->pa);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr);
  if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);}
  ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr);
#else
  ierr = PetscFree(submatj->rmap);CHKERRQ(ierr);
#endif

  if (!submatj->allcolumns) {
#if defined(PETSC_USE_CTABLE)
    ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr);
#else
    ierr = PetscFree(submatj->cmap);CHKERRQ(ierr);
#endif
  }
  /* chain to the destroy routine that was in place before we attached ours */
  ierr = submatj->destroy(C);CHKERRQ(ierr);
  ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr);

  ierr = PetscFree(submatj);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatGetSubMatrices_MPIAIJ_SingleIS_Local - extract one submatrix (ismax must
  be 1 on every process) given sorted isrow[0] and iscol[0] (or all columns
  when allcolumns is true); scall selects initial creation vs reuse.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt    *onodes1,*olengths1,idex,end;
  Mat_SubMat     *smatis1;
  PetscBool      isrowsorted;

  PetscFunctionBegin;
  if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all
processes have ismax=1"); 1361 1362 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1363 size = c->size; 1364 rank = c->rank; 1365 1366 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1367 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1368 1369 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1370 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1371 if (allcolumns) { 1372 icol = NULL; 1373 ncol = C->cmap->N; 1374 } else { 1375 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1376 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1377 } 1378 1379 if (scall == MAT_INITIAL_MATRIX) { 1380 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1381 1382 /* Get some new tags to keep the communication clean */ 1383 tag1 = ((PetscObject)C)->tag; 1384 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1385 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1386 1387 /* evaluate communication - mesg to who, length of mesg, and buffer space 1388 required. Based on this, buffers are allocated, and data copied into them */ 1389 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1390 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1391 1392 /* w1[proc] = num of rows owned by proc -- to be requested */ 1393 proc = 0; 1394 nrqs = 0; /* num of outgoing messages */ 1395 for (j=0; j<nrow; j++) { 1396 row = irow[j]; /* sorted! 
*/ 1397 while (row >= C->rmap->range[proc+1]) proc++; 1398 w1[proc]++; 1399 row2proc[j] = proc; /* map row index to proc */ 1400 1401 if (proc != rank && !w2[proc]) { 1402 w2[proc] = 1; nrqs++; 1403 } 1404 } 1405 w1[rank] = 0; /* rows owned by self will not be requested */ 1406 1407 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1408 for (proc=0,j=0; proc<size; proc++) { 1409 if (w1[proc]) { pa[j++] = proc;} 1410 } 1411 1412 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1413 msz = 0; /* total mesg length (for all procs) */ 1414 for (i=0; i<nrqs; i++) { 1415 proc = pa[i]; 1416 w1[proc] += 3; 1417 msz += w1[proc]; 1418 } 1419 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1420 1421 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1422 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1423 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1424 1425 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1426 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1427 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1428 1429 /* Now post the Irecvs corresponding to these messages */ 1430 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1431 1432 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1433 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1434 1435 /* Allocate Memory for outgoing messages */ 1436 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1437 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1438 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1439 1440 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1441 iptr = tmp; 1442 for (i=0; i<nrqs; i++) { 1443 proc = pa[i]; 1444 sbuf1[proc] = iptr; 
1445 iptr += w1[proc]; 1446 } 1447 1448 /* Form the outgoing messages */ 1449 /* Initialize the header space */ 1450 for (i=0; i<nrqs; i++) { 1451 proc = pa[i]; 1452 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1453 ptr[proc] = sbuf1[proc] + 3; 1454 } 1455 1456 /* Parse the isrow and copy data into outbuf */ 1457 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1458 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1459 proc = row2proc[j]; 1460 if (proc != rank) { /* copy to the outgoing buf*/ 1461 *ptr[proc] = irow[j]; 1462 ctr[proc]++; ptr[proc]++; 1463 } 1464 } 1465 1466 /* Update the headers for the current IS */ 1467 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1468 if ((ctr_j = ctr[j])) { 1469 sbuf1_j = sbuf1[j]; 1470 k = ++sbuf1_j[0]; 1471 sbuf1_j[2*k] = ctr_j; 1472 sbuf1_j[2*k-1] = 0; 1473 } 1474 } 1475 1476 /* Now post the sends */ 1477 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1478 for (i=0; i<nrqs; ++i) { 1479 proc = pa[i]; 1480 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1481 } 1482 1483 /* Post Receives to capture the buffer size */ 1484 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1485 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1486 1487 rbuf2[0] = tmp + msz; 1488 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1489 1490 for (i=0; i<nrqs; ++i) { 1491 proc = pa[i]; 1492 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1493 } 1494 1495 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1496 1497 /* Send to other procs the buf size they should allocate */ 1498 /* Receive messages*/ 1499 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1500 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1501 1502 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1503 for 
(i=0; i<nrqr; ++i) { 1504 req_size[i] = 0; 1505 rbuf1_i = rbuf1[i]; 1506 start = 2*rbuf1_i[0] + 1; 1507 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1508 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1509 sbuf2_i = sbuf2[i]; 1510 for (j=start; j<end; j++) { 1511 k = rbuf1_i[j] - rstart; 1512 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1513 sbuf2_i[j] = ncols; 1514 req_size[i] += ncols; 1515 } 1516 req_source1[i] = r_status1[i].MPI_SOURCE; 1517 1518 /* form the header */ 1519 sbuf2_i[0] = req_size[i]; 1520 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1521 1522 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1523 } 1524 1525 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1526 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1527 1528 /* rbuf2 is received, Post recv column indices a->j */ 1529 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1530 1531 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1532 for (i=0; i<nrqs; ++i) { 1533 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1534 req_source2[i] = r_status2[i].MPI_SOURCE; 1535 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1536 } 1537 1538 /* Wait on sends1 and sends2 */ 1539 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1540 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1541 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1542 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1543 1544 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1545 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1546 1547 /* Now allocate sending buffers for a->j, and send them off */ 1548 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1549 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1550 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1551 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + 
req_size[i-1]; 1552 1553 for (i=0; i<nrqr; i++) { /* for each requested message */ 1554 rbuf1_i = rbuf1[i]; 1555 sbuf_aj_i = sbuf_aj[i]; 1556 ct1 = 2*rbuf1_i[0] + 1; 1557 ct2 = 0; 1558 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1559 1560 kmax = rbuf1[i][2]; 1561 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1562 row = rbuf1_i[ct1] - rstart; 1563 nzA = ai[row+1] - ai[row]; 1564 nzB = bi[row+1] - bi[row]; 1565 ncols = nzA + nzB; 1566 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1567 1568 /* load the column indices for this row into cols*/ 1569 cols = sbuf_aj_i + ct2; 1570 1571 lwrite = 0; 1572 for (l=0; l<nzB; l++) { 1573 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1574 } 1575 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1576 for (l=0; l<nzB; l++) { 1577 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1578 } 1579 1580 ct2 += ncols; 1581 } 1582 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1583 } 1584 1585 /* create column map (cmap): global col of C -> local col of submat */ 1586 #if defined(PETSC_USE_CTABLE) 1587 if (!allcolumns) { 1588 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1589 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1590 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1591 if (icol[j] >= cstart && icol[j] <cend) { 1592 cmap_loc[icol[j] - cstart] = j+1; 1593 } else { /* use PetscTable for non-local col indices */ 1594 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1595 } 1596 } 1597 } else { 1598 cmap = NULL; 1599 cmap_loc = NULL; 1600 } 1601 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1602 #else 1603 if (!allcolumns) { 1604 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1605 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1606 } else { 1607 cmap = NULL; 1608 } 1609 #endif 1610 1611 /* Create lens for 
MatSeqAIJSetPreallocation() */ 1612 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1613 1614 /* Compute lens from local part of C */ 1615 for (j=0; j<nrow; j++) { 1616 row = irow[j]; 1617 proc = row2proc[j]; 1618 if (proc == rank) { 1619 /* diagonal part A = c->A */ 1620 ncols = ai[row-rstart+1] - ai[row-rstart]; 1621 cols = aj + ai[row-rstart]; 1622 if (!allcolumns) { 1623 for (k=0; k<ncols; k++) { 1624 #if defined(PETSC_USE_CTABLE) 1625 tcol = cmap_loc[cols[k]]; 1626 #else 1627 tcol = cmap[cols[k]+cstart]; 1628 #endif 1629 if (tcol) lens[j]++; 1630 } 1631 } else { /* allcolumns */ 1632 lens[j] = ncols; 1633 } 1634 1635 /* off-diagonal part B = c->B */ 1636 ncols = bi[row-rstart+1] - bi[row-rstart]; 1637 cols = bj + bi[row-rstart]; 1638 if (!allcolumns) { 1639 for (k=0; k<ncols; k++) { 1640 #if defined(PETSC_USE_CTABLE) 1641 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1642 #else 1643 tcol = cmap[bmap[cols[k]]]; 1644 #endif 1645 if (tcol) lens[j]++; 1646 } 1647 } else { /* allcolumns */ 1648 lens[j] += ncols; 1649 } 1650 } 1651 } 1652 1653 /* Create row map (rmap): global row of C -> local row of submat */ 1654 #if defined(PETSC_USE_CTABLE) 1655 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1656 for (j=0; j<nrow; j++) { 1657 row = irow[j]; 1658 proc = row2proc[j]; 1659 if (proc == rank) { /* a local row */ 1660 rmap_loc[row - rstart] = j; 1661 } else { 1662 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1663 } 1664 } 1665 #else 1666 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1667 for (j=0; j<nrow; j++) { 1668 rmap[irow[j]] = j; 1669 } 1670 #endif 1671 1672 /* Update lens from offproc data */ 1673 /* recv a->j is done */ 1674 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1675 for (i=0; i<nrqs; i++) { 1676 proc = pa[i]; 1677 sbuf1_i = sbuf1[proc]; 1678 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1679 ct1 = 2 + 1; 1680 ct2 = 0; 1681 
rbuf2_i = rbuf2[i]; /* received length of C->j */ 1682 rbuf3_i = rbuf3[i]; /* received C->j */ 1683 1684 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1685 max1 = sbuf1_i[2]; 1686 for (k=0; k<max1; k++,ct1++) { 1687 #if defined(PETSC_USE_CTABLE) 1688 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1689 row--; 1690 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1691 #else 1692 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1693 #endif 1694 /* Now, store row index of submat in sbuf1_i[ct1] */ 1695 sbuf1_i[ct1] = row; 1696 1697 nnz = rbuf2_i[ct1]; 1698 if (!allcolumns) { 1699 for (l=0; l<nnz; l++,ct2++) { 1700 #if defined(PETSC_USE_CTABLE) 1701 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1702 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1703 } else { 1704 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1705 } 1706 #else 1707 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1708 #endif 1709 if (tcol) lens[row]++; 1710 } 1711 } else { /* allcolumns */ 1712 lens[row] += nnz; 1713 } 1714 } 1715 } 1716 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1717 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1718 1719 /* Create the submatrices */ 1720 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1721 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1722 1723 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1724 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1725 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1726 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1727 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1728 1729 /* create struct Mat_SubMat and attached it to submat */ 1730 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1731 subc = (Mat_SeqAIJ*)submat->data; 1732 subc->submatis1 = smatis1; 1733 1734 smatis1->id = 0; 1735 
smatis1->nrqs = nrqs; 1736 smatis1->nrqr = nrqr; 1737 smatis1->rbuf1 = rbuf1; 1738 smatis1->rbuf2 = rbuf2; 1739 smatis1->rbuf3 = rbuf3; 1740 smatis1->sbuf2 = sbuf2; 1741 smatis1->req_source2 = req_source2; 1742 1743 smatis1->sbuf1 = sbuf1; 1744 smatis1->ptr = ptr; 1745 smatis1->tmp = tmp; 1746 smatis1->ctr = ctr; 1747 1748 smatis1->pa = pa; 1749 smatis1->req_size = req_size; 1750 smatis1->req_source1 = req_source1; 1751 1752 smatis1->allcolumns = allcolumns; 1753 smatis1->singleis = PETSC_TRUE; 1754 smatis1->row2proc = row2proc; 1755 smatis1->rmap = rmap; 1756 smatis1->cmap = cmap; 1757 #if defined(PETSC_USE_CTABLE) 1758 smatis1->rmap_loc = rmap_loc; 1759 smatis1->cmap_loc = cmap_loc; 1760 #endif 1761 1762 smatis1->destroy = submat->ops->destroy; 1763 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1764 submat->factortype = C->factortype; 1765 1766 /* compute rmax */ 1767 rmax = 0; 1768 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1769 1770 } else { /* scall == MAT_REUSE_MATRIX */ 1771 submat = submats[0]; 1772 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1773 1774 subc = (Mat_SeqAIJ*)submat->data; 1775 rmax = subc->rmax; 1776 smatis1 = subc->submatis1; 1777 nrqs = smatis1->nrqs; 1778 nrqr = smatis1->nrqr; 1779 rbuf1 = smatis1->rbuf1; 1780 rbuf2 = smatis1->rbuf2; 1781 rbuf3 = smatis1->rbuf3; 1782 req_source2 = smatis1->req_source2; 1783 1784 sbuf1 = smatis1->sbuf1; 1785 sbuf2 = smatis1->sbuf2; 1786 ptr = smatis1->ptr; 1787 tmp = smatis1->tmp; 1788 ctr = smatis1->ctr; 1789 1790 pa = smatis1->pa; 1791 req_size = smatis1->req_size; 1792 req_source1 = smatis1->req_source1; 1793 1794 allcolumns = smatis1->allcolumns; 1795 row2proc = smatis1->row2proc; 1796 rmap = smatis1->rmap; 1797 cmap = smatis1->cmap; 1798 #if defined(PETSC_USE_CTABLE) 1799 rmap_loc = smatis1->rmap_loc; 1800 cmap_loc = smatis1->cmap_loc; 1801 #endif 1802 } 1803 1804 /* Post recv matrix values */ 1805 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1806 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1807 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1808 for (i=0; i<nrqs; ++i) { 1809 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1810 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1811 } 1812 1813 /* Allocate sending buffers for a->a, and send them off */ 1814 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1815 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1816 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1817 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1818 1819 for (i=0; i<nrqr; i++) { 1820 rbuf1_i = rbuf1[i]; 1821 sbuf_aa_i = sbuf_aa[i]; 1822 ct1 = 2*rbuf1_i[0]+1; 1823 ct2 = 0; 1824 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1825 1826 kmax = rbuf1_i[2]; 1827 for (k=0; k<kmax; k++,ct1++) { 1828 row = rbuf1_i[ct1] - rstart; 1829 nzA = ai[row+1] - ai[row]; 1830 nzB = bi[row+1] - 
bi[row]; 1831 ncols = nzA + nzB; 1832 cworkB = bj + bi[row]; 1833 vworkA = a_a + ai[row]; 1834 vworkB = b_a + bi[row]; 1835 1836 /* load the column values for this row into vals*/ 1837 vals = sbuf_aa_i + ct2; 1838 1839 lwrite = 0; 1840 for (l=0; l<nzB; l++) { 1841 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1842 } 1843 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1844 for (l=0; l<nzB; l++) { 1845 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1846 } 1847 1848 ct2 += ncols; 1849 } 1850 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1851 } 1852 1853 /* Assemble submat */ 1854 /* First assemble the local rows */ 1855 for (j=0; j<nrow; j++) { 1856 row = irow[j]; 1857 proc = row2proc[j]; 1858 if (proc == rank) { 1859 Crow = row - rstart; /* local row index of C */ 1860 #if defined(PETSC_USE_CTABLE) 1861 row = rmap_loc[Crow]; /* row index of submat */ 1862 #else 1863 row = rmap[row]; 1864 #endif 1865 1866 if (allcolumns) { 1867 /* diagonal part A = c->A */ 1868 ncols = ai[Crow+1] - ai[Crow]; 1869 cols = aj + ai[Crow]; 1870 vals = a->a + ai[Crow]; 1871 i = 0; 1872 for (k=0; k<ncols; k++) { 1873 subcols[i] = cols[k] + cstart; 1874 subvals[i++] = vals[k]; 1875 } 1876 1877 /* off-diagonal part B = c->B */ 1878 ncols = bi[Crow+1] - bi[Crow]; 1879 cols = bj + bi[Crow]; 1880 vals = b->a + bi[Crow]; 1881 for (k=0; k<ncols; k++) { 1882 subcols[i] = bmap[cols[k]]; 1883 subvals[i++] = vals[k]; 1884 } 1885 1886 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1887 1888 } else { /* !allcolumns */ 1889 #if defined(PETSC_USE_CTABLE) 1890 /* diagonal part A = c->A */ 1891 ncols = ai[Crow+1] - ai[Crow]; 1892 cols = aj + ai[Crow]; 1893 vals = a->a + ai[Crow]; 1894 i = 0; 1895 for (k=0; k<ncols; k++) { 1896 tcol = cmap_loc[cols[k]]; 1897 if (tcol) { 1898 subcols[i] = --tcol; 1899 subvals[i++] = vals[k]; 1900 } 1901 } 1902 1903 /* off-diagonal part B = c->B */ 
1904 ncols = bi[Crow+1] - bi[Crow]; 1905 cols = bj + bi[Crow]; 1906 vals = b->a + bi[Crow]; 1907 for (k=0; k<ncols; k++) { 1908 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1909 if (tcol) { 1910 subcols[i] = --tcol; 1911 subvals[i++] = vals[k]; 1912 } 1913 } 1914 #else 1915 /* diagonal part A = c->A */ 1916 ncols = ai[Crow+1] - ai[Crow]; 1917 cols = aj + ai[Crow]; 1918 vals = a->a + ai[Crow]; 1919 i = 0; 1920 for (k=0; k<ncols; k++) { 1921 tcol = cmap[cols[k]+cstart]; 1922 if (tcol) { 1923 subcols[i] = --tcol; 1924 subvals[i++] = vals[k]; 1925 } 1926 } 1927 1928 /* off-diagonal part B = c->B */ 1929 ncols = bi[Crow+1] - bi[Crow]; 1930 cols = bj + bi[Crow]; 1931 vals = b->a + bi[Crow]; 1932 for (k=0; k<ncols; k++) { 1933 tcol = cmap[bmap[cols[k]]]; 1934 if (tcol) { 1935 subcols[i] = --tcol; 1936 subvals[i++] = vals[k]; 1937 } 1938 } 1939 #endif 1940 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1941 } 1942 } 1943 } 1944 1945 /* Now assemble the off-proc rows */ 1946 for (i=0; i<nrqs; i++) { /* for each requested message */ 1947 /* recv values from other processes */ 1948 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1949 proc = pa[idex]; 1950 sbuf1_i = sbuf1[proc]; 1951 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1952 ct1 = 2 + 1; 1953 ct2 = 0; /* count of received C->j */ 1954 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1955 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1956 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1957 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1958 1959 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1960 max1 = sbuf1_i[2]; /* num of rows */ 1961 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1962 row = sbuf1_i[ct1]; /* row index of submat */ 
1963 if (!allcolumns) { 1964 idex = 0; 1965 if (scall == MAT_INITIAL_MATRIX) { 1966 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1967 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1968 #if defined(PETSC_USE_CTABLE) 1969 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1970 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1971 } else { 1972 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1973 } 1974 #else 1975 tcol = cmap[rbuf3_i[ct2]]; 1976 #endif 1977 if (tcol) { 1978 subcols[idex] = --tcol; 1979 subvals[idex++] = rbuf4_i[ct2]; 1980 1981 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1982 For reuse, we replace received C->j with index that should be inserted to submat */ 1983 rbuf3_i[ct3++] = ct2; 1984 } 1985 } 1986 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1987 1988 } else { /* scall == MAT_REUSE_MATRIX */ 1989 submat = submats[0]; 1990 subc = (Mat_SeqAIJ*)submat->data; 1991 1992 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1993 for (l=0; l<nnz; l++) { 1994 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1995 subvals[idex++] = rbuf4_i[ct2]; 1996 } 1997 1998 bj = subc->j + subc->i[row]; 1999 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 2000 } 2001 } else { /* allcolumns */ 2002 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 2003 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 2004 ct2 += nnz; 2005 } 2006 } 2007 } 2008 2009 /* sending a->a are done */ 2010 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 2011 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 2012 2013 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2014 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2015 submats[0] = submat; 2016 2017 /* 
Restore the indices */ 2018 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 2019 if (!allcolumns) { 2020 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 2021 } 2022 2023 /* Destroy allocated memory */ 2024 for (i=0; i<nrqs; ++i) { 2025 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 2026 } 2027 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 2028 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2029 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2030 2031 if (scall == MAT_INITIAL_MATRIX) { 2032 ierr = PetscFree(lens);CHKERRQ(ierr); 2033 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2034 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2035 } 2036 PetscFunctionReturn(0); 2037 } 2038 2039 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2040 { 2041 PetscErrorCode ierr; 2042 PetscInt ncol; 2043 PetscBool colflag,allcolumns=PETSC_FALSE; 2044 2045 PetscFunctionBegin; 2046 /* Allocate memory to hold all the submatrices */ 2047 if (scall == MAT_INITIAL_MATRIX) { 2048 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2049 } 2050 2051 /* Check for special case: each processor gets entire matrix columns */ 2052 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2053 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2054 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2055 2056 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2057 PetscFunctionReturn(0); 2058 } 2059 2060 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2061 { 2062 PetscErrorCode ierr; 2063 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2]; 2064 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 2065 Mat_SeqAIJ *subc; 2066 Mat_SubMat *smat; 2067 2068 PetscFunctionBegin; 2069 /* Check for special case: each processor has a single IS */ 2070 if 
(C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 2071 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2072 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */ 2073 PetscFunctionReturn(0); 2074 } 2075 2076 /* Collect global wantallmatrix and nstages */ 2077 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 2078 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2079 if (!nmax) nmax = 1; 2080 2081 if (scall == MAT_INITIAL_MATRIX) { 2082 /* Collect global wantallmatrix and nstages */ 2083 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2084 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2085 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2086 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2087 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2088 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2089 wantallmatrix = PETSC_TRUE; 2090 2091 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2092 } 2093 } 2094 2095 /* Determine the number of stages through which submatrices are done 2096 Each stage will extract nmax submatrices. 2097 nmax is determined by the matrix column dimension. 2098 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2099 */ 2100 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 2101 2102 in[0] = -1*(PetscInt)wantallmatrix; 2103 in[1] = nstages; 2104 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2105 wantallmatrix = (PetscBool)(-out[0]); 2106 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2107 2108 } else { /* MAT_REUSE_MATRIX */ 2109 if (ismax) { 2110 subc = (Mat_SeqAIJ*)((*submat)[0]->data); 2111 smat = subc->submatis1; 2112 } else { /* (*submat)[0] is a dummy matrix */ 2113 smat = (Mat_SubMat*)((*submat)[0]->spptr); 2114 } 2115 if (!smat) { 2116 /* smat is not generated by MatGetSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2117 wantallmatrix = PETSC_TRUE; 2118 } else if (smat->singleis) { 2119 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2120 PetscFunctionReturn(0); 2121 } else { 2122 nstages = smat->nstages; 2123 } 2124 } 2125 2126 if (wantallmatrix) { 2127 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2128 PetscFunctionReturn(0); 2129 } 2130 2131 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2132 if (scall == MAT_INITIAL_MATRIX) { 2133 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2134 } 2135 2136 #if 0 2137 PetscMPIInt rank; 2138 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)C),&rank);CHKERRQ(ierr); 2139 printf("[%d] reuse %d, ismax %d, CN %d, nstages %d, nmax %d\n",rank,scall,ismax,C->cmap->N,nstages,nmax); 2140 #endif 2141 for (i=0,pos=0; i<nstages; i++) { 2142 if (pos+nmax <= ismax) max_no = nmax; 2143 else if (pos == ismax) max_no = 0; 2144 else max_no = ismax-pos; 2145 /* if (!max_no) printf("[%d] max_no=0, %d-stage\n",rank,i); */ 2146 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2147 pos += max_no; 2148 } 2149 2150 if (scall == MAT_INITIAL_MATRIX) { 2151 /* save nstages for reuse */ 2152 if (ismax) { 2153 subc = 
(Mat_SeqAIJ*)((*submat)[0]->data); 2154 smat = subc->submatis1; 2155 } else { /* dummy (*submat)[0] */ 2156 smat = (Mat_SubMat*)((*submat)[0]->spptr); 2157 } 2158 if (!smat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"smat does not exit"); 2159 smat->nstages = nstages; 2160 } 2161 PetscFunctionReturn(0); 2162 } 2163 2164 /* -------------------------------------------------------------------------*/ 2165 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2166 { 2167 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2168 Mat A = c->A; 2169 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2170 const PetscInt **icol,**irow; 2171 PetscInt *nrow,*ncol,start; 2172 PetscErrorCode ierr; 2173 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2174 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2175 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2176 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2177 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2178 #if defined(PETSC_USE_CTABLE) 2179 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2180 #else 2181 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2182 #endif 2183 const PetscInt *irow_i; 2184 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2185 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2186 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2187 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2188 MPI_Status *r_status3,*r_status4,*s_status4; 2189 MPI_Comm comm; 2190 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2191 PetscMPIInt *onodes1,*olengths1,end; 2192 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2193 Mat_SubMat *smat_i; 2194 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2195 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 
2196 2197 PetscFunctionBegin; 2198 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2199 size = c->size; 2200 rank = c->rank; 2201 2202 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2203 ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2204 2205 for (i=0; i<ismax; i++) { 2206 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2207 if (!issorted[i]) iscsorted = issorted[i]; 2208 2209 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2210 2211 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2212 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2213 2214 /* Check for special case: allcolumn */ 2215 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2216 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2217 if (colflag && ncol[i] == C->cmap->N) { 2218 allcolumns[i] = PETSC_TRUE; 2219 icol[i] = NULL; 2220 } else { 2221 allcolumns[i] = PETSC_FALSE; 2222 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2223 } 2224 } 2225 2226 if (scall == MAT_REUSE_MATRIX) { 2227 /* Assumes new rows are same length as the old rows */ 2228 for (i=0; i<ismax; i++) { 2229 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2230 subc = (Mat_SeqAIJ*)(submats[i]->data); 2231 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 2232 2233 /* Initial matrix as if empty */ 2234 ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2235 2236 smat_i = subc->submatis1; 2237 2238 nrqs = smat_i->nrqs; 2239 nrqr = smat_i->nrqr; 2240 rbuf1 = smat_i->rbuf1; 2241 rbuf2 = smat_i->rbuf2; 2242 rbuf3 = smat_i->rbuf3; 2243 req_source2 = smat_i->req_source2; 2244 2245 sbuf1 = smat_i->sbuf1; 2246 sbuf2 = smat_i->sbuf2; 2247 ptr = smat_i->ptr; 2248 tmp = smat_i->tmp; 2249 ctr = smat_i->ctr; 2250 2251 pa = smat_i->pa; 2252 req_size = smat_i->req_size; 2253 req_source1 = smat_i->req_source1; 2254 2255 allcolumns[i] = smat_i->allcolumns; 2256 row2proc[i] = smat_i->row2proc; 2257 rmap[i] = smat_i->rmap; 2258 cmap[i] = smat_i->cmap; 2259 } 2260 2261 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2262 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2263 smat_i = (Mat_SubMat*)(submats[0]->spptr); 2264 2265 nrqs = smat_i->nrqs; 2266 nrqr = smat_i->nrqr; 2267 rbuf1 = smat_i->rbuf1; 2268 rbuf2 = smat_i->rbuf2; 2269 rbuf3 = smat_i->rbuf3; 2270 req_source2 = smat_i->req_source2; 2271 2272 sbuf1 = smat_i->sbuf1; 2273 sbuf2 = smat_i->sbuf2; 2274 ptr = smat_i->ptr; 2275 tmp = smat_i->tmp; 2276 ctr = smat_i->ctr; 2277 2278 pa = smat_i->pa; 2279 req_size = smat_i->req_size; 2280 req_source1 = smat_i->req_source1; 2281 2282 allcolumns[0] = PETSC_TRUE; 2283 } 2284 } else { /* scall == MAT_INITIAL_MATRIX */ 2285 /* Get some new tags to keep the communication clean */ 2286 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2287 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2288 2289 /* evaluate communication - mesg to who, length of mesg, and buffer space 2290 required. 
Based on this, buffers are allocated, and data copied into them*/ 2291 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2292 2293 for (i=0; i<ismax; i++) { 2294 jmax = nrow[i]; 2295 irow_i = irow[i]; 2296 2297 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2298 row2proc[i] = row2proc_i; 2299 2300 if (issorted[i]) proc = 0; 2301 for (j=0; j<jmax; j++) { 2302 if (!issorted[i]) proc = 0; 2303 row = irow_i[j]; 2304 while (row >= C->rmap->range[proc+1]) proc++; 2305 w4[proc]++; 2306 row2proc_i[j] = proc; /* map row index to proc */ 2307 } 2308 for (j=0; j<size; j++) { 2309 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2310 } 2311 } 2312 2313 nrqs = 0; /* no of outgoing messages */ 2314 msz = 0; /* total mesg length (for all procs) */ 2315 w1[rank] = 0; /* no mesg sent to self */ 2316 w3[rank] = 0; 2317 for (i=0; i<size; i++) { 2318 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2319 } 2320 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2321 for (i=0,j=0; i<size; i++) { 2322 if (w1[i]) { pa[j] = i; j++; } 2323 } 2324 2325 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2326 for (i=0; i<nrqs; i++) { 2327 j = pa[i]; 2328 w1[j] += w2[j] + 2* w3[j]; 2329 msz += w1[j]; 2330 } 2331 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2332 2333 /* Determine the number of messages to expect, their lengths, from from-ids */ 2334 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2335 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2336 2337 /* Now post the Irecvs corresponding to these messages */ 2338 tag0 = ((PetscObject)C)->tag; 2339 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2340 2341 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2342 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2343 2344 /* 
Allocate Memory for outgoing messages */ 2345 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2346 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2347 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2348 2349 { 2350 PetscInt *iptr = tmp; 2351 k = 0; 2352 for (i=0; i<nrqs; i++) { 2353 j = pa[i]; 2354 iptr += k; 2355 sbuf1[j] = iptr; 2356 k = w1[j]; 2357 } 2358 } 2359 2360 /* Form the outgoing messages. Initialize the header space */ 2361 for (i=0; i<nrqs; i++) { 2362 j = pa[i]; 2363 sbuf1[j][0] = 0; 2364 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2365 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2366 } 2367 2368 /* Parse the isrow and copy data into outbuf */ 2369 for (i=0; i<ismax; i++) { 2370 row2proc_i = row2proc[i]; 2371 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2372 irow_i = irow[i]; 2373 jmax = nrow[i]; 2374 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2375 proc = row2proc_i[j]; 2376 if (proc != rank) { /* copy to the outgoing buf*/ 2377 ctr[proc]++; 2378 *ptr[proc] = irow_i[j]; 2379 ptr[proc]++; 2380 } 2381 } 2382 /* Update the headers for the current IS */ 2383 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2384 if ((ctr_j = ctr[j])) { 2385 sbuf1_j = sbuf1[j]; 2386 k = ++sbuf1_j[0]; 2387 sbuf1_j[2*k] = ctr_j; 2388 sbuf1_j[2*k-1] = i; 2389 } 2390 } 2391 } 2392 2393 /* Now post the sends */ 2394 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2395 for (i=0; i<nrqs; ++i) { 2396 j = pa[i]; 2397 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2398 } 2399 2400 /* Post Receives to capture the buffer size */ 2401 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2402 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2403 rbuf2[0] = tmp + msz; 2404 for (i=1; i<nrqs; ++i) { 2405 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2406 } 2407 for (i=0; i<nrqs; ++i) { 2408 j = pa[i]; 2409 
ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 2410 } 2411 2412 /* Send to other procs the buf size they should allocate */ 2413 /* Receive messages*/ 2414 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2415 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2416 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2417 { 2418 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2419 PetscInt *sbuf2_i; 2420 2421 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2422 for (i=0; i<nrqr; ++i) { 2423 req_size[i] = 0; 2424 rbuf1_i = rbuf1[i]; 2425 start = 2*rbuf1_i[0] + 1; 2426 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2427 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2428 sbuf2_i = sbuf2[i]; 2429 for (j=start; j<end; j++) { 2430 id = rbuf1_i[j] - rstart; 2431 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2432 sbuf2_i[j] = ncols; 2433 req_size[i] += ncols; 2434 } 2435 req_source1[i] = r_status1[i].MPI_SOURCE; 2436 /* form the header */ 2437 sbuf2_i[0] = req_size[i]; 2438 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2439 2440 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 2441 } 2442 } 2443 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2444 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2445 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2446 2447 /* Receive messages*/ 2448 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2449 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2450 2451 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2452 for (i=0; i<nrqs; ++i) { 2453 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2454 req_source2[i] = r_status2[i].MPI_SOURCE; 2455 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 2456 } 2457 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2458 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2459 2460 /* Wait on 
sends1 and sends2 */ 2461 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2462 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2463 2464 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2465 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2466 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2467 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2468 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2469 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2470 2471 /* Now allocate sending buffers for a->j, and send them off */ 2472 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2473 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2474 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2475 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2476 2477 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2478 { 2479 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2480 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2481 PetscInt cend = C->cmap->rend; 2482 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2483 2484 for (i=0; i<nrqr; i++) { 2485 rbuf1_i = rbuf1[i]; 2486 sbuf_aj_i = sbuf_aj[i]; 2487 ct1 = 2*rbuf1_i[0] + 1; 2488 ct2 = 0; 2489 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2490 kmax = rbuf1[i][2*j]; 2491 for (k=0; k<kmax; k++,ct1++) { 2492 row = rbuf1_i[ct1] - rstart; 2493 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2494 ncols = nzA + nzB; 2495 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2496 2497 /* load the column indices for this row into cols */ 2498 cols = sbuf_aj_i + ct2; 2499 2500 lwrite = 0; 2501 for (l=0; l<nzB; l++) { 2502 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2503 } 2504 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2505 for (l=0; l<nzB; l++) { 2506 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2507 } 2508 2509 ct2 += ncols; 2510 } 2511 } 2512 ierr = 
MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 2513 } 2514 } 2515 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2516 2517 /* create col map: global col of C -> local col of submatrices */ 2518 { 2519 const PetscInt *icol_i; 2520 #if defined(PETSC_USE_CTABLE) 2521 for (i=0; i<ismax; i++) { 2522 if (!allcolumns[i]) { 2523 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2524 2525 jmax = ncol[i]; 2526 icol_i = icol[i]; 2527 cmap_i = cmap[i]; 2528 for (j=0; j<jmax; j++) { 2529 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2530 } 2531 } else cmap[i] = NULL; 2532 } 2533 #else 2534 for (i=0; i<ismax; i++) { 2535 if (!allcolumns[i]) { 2536 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2537 jmax = ncol[i]; 2538 icol_i = icol[i]; 2539 cmap_i = cmap[i]; 2540 for (j=0; j<jmax; j++) { 2541 cmap_i[icol_i[j]] = j+1; 2542 } 2543 } else cmap[i] = NULL; 2544 } 2545 #endif 2546 } 2547 2548 /* Create lens which is required for MatCreate... 
*/ 2549 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2550 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2551 2552 if (ismax) { 2553 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2554 } 2555 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2556 2557 /* Update lens from local data */ 2558 for (i=0; i<ismax; i++) { 2559 row2proc_i = row2proc[i]; 2560 jmax = nrow[i]; 2561 if (!allcolumns[i]) cmap_i = cmap[i]; 2562 irow_i = irow[i]; 2563 lens_i = lens[i]; 2564 for (j=0; j<jmax; j++) { 2565 row = irow_i[j]; 2566 proc = row2proc_i[j]; 2567 if (proc == rank) { 2568 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2569 if (!allcolumns[i]) { 2570 for (k=0; k<ncols; k++) { 2571 #if defined(PETSC_USE_CTABLE) 2572 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2573 #else 2574 tcol = cmap_i[cols[k]]; 2575 #endif 2576 if (tcol) lens_i[j]++; 2577 } 2578 } else { /* allcolumns */ 2579 lens_i[j] = ncols; 2580 } 2581 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2582 } 2583 } 2584 } 2585 2586 /* Create row map: global row of C -> local row of submatrices */ 2587 #if defined(PETSC_USE_CTABLE) 2588 for (i=0; i<ismax; i++) { 2589 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2590 irow_i = irow[i]; 2591 jmax = nrow[i]; 2592 for (j=0; j<jmax; j++) { 2593 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2594 } 2595 } 2596 #else 2597 for (i=0; i<ismax; i++) { 2598 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2599 rmap_i = rmap[i]; 2600 irow_i = irow[i]; 2601 jmax = nrow[i]; 2602 for (j=0; j<jmax; j++) { 2603 rmap_i[irow_i[j]] = j; 2604 } 2605 } 2606 #endif 2607 2608 /* Update lens from offproc data */ 2609 { 2610 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2611 2612 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2613 for (tmp2=0; tmp2<nrqs; tmp2++) { 2614 sbuf1_i = sbuf1[pa[tmp2]]; 2615 jmax = sbuf1_i[0]; 2616 ct1 = 2*jmax+1; 2617 ct2 = 0; 2618 rbuf2_i = rbuf2[tmp2]; 2619 
rbuf3_i = rbuf3[tmp2]; 2620 for (j=1; j<=jmax; j++) { 2621 is_no = sbuf1_i[2*j-1]; 2622 max1 = sbuf1_i[2*j]; 2623 lens_i = lens[is_no]; 2624 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2625 rmap_i = rmap[is_no]; 2626 for (k=0; k<max1; k++,ct1++) { 2627 #if defined(PETSC_USE_CTABLE) 2628 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2629 row--; 2630 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2631 #else 2632 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2633 #endif 2634 max2 = rbuf2_i[ct1]; 2635 for (l=0; l<max2; l++,ct2++) { 2636 if (!allcolumns[is_no]) { 2637 #if defined(PETSC_USE_CTABLE) 2638 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2639 #else 2640 tcol = cmap_i[rbuf3_i[ct2]]; 2641 #endif 2642 if (tcol) lens_i[row]++; 2643 } else { /* allcolumns */ 2644 lens_i[row]++; /* lens_i[row] += max2 ? */ 2645 } 2646 } 2647 } 2648 } 2649 } 2650 } 2651 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2652 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2653 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2654 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2655 2656 /* Create the submatrices */ 2657 for (i=0; i<ismax; i++) { 2658 PetscInt rbs,cbs; 2659 2660 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2661 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2662 2663 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2664 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2665 2666 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2667 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2668 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2669 2670 /* create struct Mat_SubMat and attached it to submat */ 2671 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2672 subc = (Mat_SeqAIJ*)submats[i]->data; 2673 subc->submatis1 = smat_i; 2674 2675 smat_i->destroy = 
submats[i]->ops->destroy; 2676 submats[i]->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 2677 submats[i]->factortype = C->factortype; 2678 2679 smat_i->id = i; 2680 smat_i->nrqs = nrqs; 2681 smat_i->nrqr = nrqr; 2682 smat_i->rbuf1 = rbuf1; 2683 smat_i->rbuf2 = rbuf2; 2684 smat_i->rbuf3 = rbuf3; 2685 smat_i->sbuf2 = sbuf2; 2686 smat_i->req_source2 = req_source2; 2687 2688 smat_i->sbuf1 = sbuf1; 2689 smat_i->ptr = ptr; 2690 smat_i->tmp = tmp; 2691 smat_i->ctr = ctr; 2692 2693 smat_i->pa = pa; 2694 smat_i->req_size = req_size; 2695 smat_i->req_source1 = req_source1; 2696 2697 smat_i->allcolumns = allcolumns[i]; 2698 smat_i->singleis = PETSC_FALSE; 2699 smat_i->row2proc = row2proc[i]; 2700 smat_i->rmap = rmap[i]; 2701 smat_i->cmap = cmap[i]; 2702 } 2703 2704 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2705 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2706 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2707 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2708 2709 /* create struct Mat_SubMat and attached it to submat */ 2710 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2711 submats[0]->spptr = (Mat_SubMat*)smat_i; 2712 2713 smat_i->destroy = submats[0]->ops->destroy; 2714 submats[0]->ops->destroy = MatDestroy_Dummy_MatGetSubmatrices; 2715 submats[0]->factortype = C->factortype; 2716 2717 smat_i->id = i; 2718 smat_i->nrqs = nrqs; 2719 smat_i->nrqr = nrqr; 2720 smat_i->rbuf1 = rbuf1; 2721 smat_i->rbuf2 = rbuf2; 2722 smat_i->rbuf3 = rbuf3; 2723 smat_i->sbuf2 = sbuf2; 2724 smat_i->req_source2 = req_source2; 2725 2726 smat_i->sbuf1 = sbuf1; 2727 smat_i->ptr = ptr; 2728 smat_i->tmp = tmp; 2729 smat_i->ctr = ctr; 2730 2731 smat_i->pa = pa; 2732 smat_i->req_size = req_size; 2733 smat_i->req_source1 = req_source1; 2734 2735 smat_i->allcolumns = PETSC_TRUE; 2736 smat_i->singleis = PETSC_FALSE; 2737 smat_i->row2proc = NULL; 2738 smat_i->rmap = NULL; 2739 smat_i->cmap = NULL; 2740 } 2741 2742 if (ismax) 
{ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2743 ierr = PetscFree(lens);CHKERRQ(ierr); 2744 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2745 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2746 2747 } /* endof scall == MAT_INITIAL_MATRIX */ 2748 2749 /* Post recv matrix values */ 2750 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2751 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2752 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2753 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2754 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2755 for (i=0; i<nrqs; ++i) { 2756 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2757 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 2758 } 2759 2760 /* Allocate sending buffers for a->a, and send them off */ 2761 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2762 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2763 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2764 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2765 2766 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2767 { 2768 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2769 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2770 PetscInt cend = C->cmap->rend; 2771 PetscInt *b_j = b->j; 2772 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2773 2774 for (i=0; i<nrqr; i++) { 2775 rbuf1_i = rbuf1[i]; 2776 sbuf_aa_i = sbuf_aa[i]; 2777 ct1 = 2*rbuf1_i[0]+1; 2778 ct2 = 0; 2779 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2780 kmax = rbuf1_i[2*j]; 2781 for (k=0; k<kmax; k++,ct1++) { 2782 row = rbuf1_i[ct1] - rstart; 2783 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2784 ncols = nzA + nzB; 2785 cworkB = b_j + b_i[row]; 2786 vworkA = a_a + a_i[row]; 2787 vworkB = b_a + b_i[row]; 2788 2789 /* load the column values for this row into vals*/ 2790 vals = sbuf_aa_i+ct2; 2791 2792 lwrite = 0; 2793 for (l=0; 
l<nzB; l++) { 2794 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2795 } 2796 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2797 for (l=0; l<nzB; l++) { 2798 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2799 } 2800 2801 ct2 += ncols; 2802 } 2803 } 2804 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 2805 } 2806 } 2807 2808 /* Assemble the matrices */ 2809 /* First assemble the local rows */ 2810 for (i=0; i<ismax; i++) { 2811 row2proc_i = row2proc[i]; 2812 subc = (Mat_SeqAIJ*)submats[i]->data; 2813 imat_ilen = subc->ilen; 2814 imat_j = subc->j; 2815 imat_i = subc->i; 2816 imat_a = subc->a; 2817 2818 if (!allcolumns[i]) cmap_i = cmap[i]; 2819 rmap_i = rmap[i]; 2820 irow_i = irow[i]; 2821 jmax = nrow[i]; 2822 for (j=0; j<jmax; j++) { 2823 row = irow_i[j]; 2824 proc = row2proc_i[j]; 2825 if (proc == rank) { 2826 old_row = row; 2827 #if defined(PETSC_USE_CTABLE) 2828 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2829 row--; 2830 #else 2831 row = rmap_i[row]; 2832 #endif 2833 ilen_row = imat_ilen[row]; 2834 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2835 mat_i = imat_i[row]; 2836 mat_a = imat_a + mat_i; 2837 mat_j = imat_j + mat_i; 2838 if (!allcolumns[i]) { 2839 for (k=0; k<ncols; k++) { 2840 #if defined(PETSC_USE_CTABLE) 2841 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2842 #else 2843 tcol = cmap_i[cols[k]]; 2844 #endif 2845 if (tcol) { 2846 *mat_j++ = tcol - 1; 2847 *mat_a++ = vals[k]; 2848 ilen_row++; 2849 } 2850 } 2851 } else { /* allcolumns */ 2852 for (k=0; k<ncols; k++) { 2853 *mat_j++ = cols[k]; /* global col index! 
*/ 2854 *mat_a++ = vals[k]; 2855 ilen_row++; 2856 } 2857 } 2858 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2859 2860 imat_ilen[row] = ilen_row; 2861 } 2862 } 2863 } 2864 2865 /* Now assemble the off proc rows */ 2866 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2867 for (tmp2=0; tmp2<nrqs; tmp2++) { 2868 sbuf1_i = sbuf1[pa[tmp2]]; 2869 jmax = sbuf1_i[0]; 2870 ct1 = 2*jmax + 1; 2871 ct2 = 0; 2872 rbuf2_i = rbuf2[tmp2]; 2873 rbuf3_i = rbuf3[tmp2]; 2874 rbuf4_i = rbuf4[tmp2]; 2875 for (j=1; j<=jmax; j++) { 2876 is_no = sbuf1_i[2*j-1]; 2877 rmap_i = rmap[is_no]; 2878 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2879 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2880 imat_ilen = subc->ilen; 2881 imat_j = subc->j; 2882 imat_i = subc->i; 2883 imat_a = subc->a; 2884 max1 = sbuf1_i[2*j]; 2885 for (k=0; k<max1; k++,ct1++) { 2886 row = sbuf1_i[ct1]; 2887 #if defined(PETSC_USE_CTABLE) 2888 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2889 row--; 2890 #else 2891 row = rmap_i[row]; 2892 #endif 2893 ilen = imat_ilen[row]; 2894 mat_i = imat_i[row]; 2895 mat_a = imat_a + mat_i; 2896 mat_j = imat_j + mat_i; 2897 max2 = rbuf2_i[ct1]; 2898 if (!allcolumns[is_no]) { 2899 for (l=0; l<max2; l++,ct2++) { 2900 #if defined(PETSC_USE_CTABLE) 2901 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2902 #else 2903 tcol = cmap_i[rbuf3_i[ct2]]; 2904 #endif 2905 if (tcol) { 2906 *mat_j++ = tcol - 1; 2907 *mat_a++ = rbuf4_i[ct2]; 2908 ilen++; 2909 } 2910 } 2911 } else { /* allcolumns */ 2912 for (l=0; l<max2; l++,ct2++) { 2913 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2914 *mat_a++ = rbuf4_i[ct2]; 2915 ilen++; 2916 } 2917 } 2918 imat_ilen[row] = ilen; 2919 } 2920 } 2921 } 2922 2923 if (!iscsorted) { /* sort column indices of the rows */ 2924 for (i=0; i<ismax; i++) { 2925 subc = (Mat_SeqAIJ*)submats[i]->data; 2926 imat_j = subc->j; 2927 imat_i = subc->i; 2928 imat_a = subc->a; 2929 imat_ilen = subc->ilen; 2930 
2931 if (allcolumns[i]) continue; 2932 jmax = nrow[i]; 2933 for (j=0; j<jmax; j++) { 2934 mat_i = imat_i[j]; 2935 mat_a = imat_a + mat_i; 2936 mat_j = imat_j + mat_i; 2937 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2938 } 2939 } 2940 } 2941 2942 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2943 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2944 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2945 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2946 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2947 2948 /* Restore the indices */ 2949 for (i=0; i<ismax; i++) { 2950 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2951 if (!allcolumns[i]) { 2952 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2953 } 2954 } 2955 2956 for (i=0; i<ismax; i++) { 2957 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2958 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2959 } 2960 if (!ismax) { /* a dummy matrix */ 2961 //ierr = MatAssemblyBegin(submats[0],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2962 //ierr = MatAssemblyEnd(submats[0],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2963 } 2964 2965 /* Destroy allocated memory */ 2966 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2967 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2968 ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr); 2969 2970 for (i=0; i<nrqs; ++i) { 2971 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2972 } 2973 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2974 2975 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2976 PetscFunctionReturn(0); 2977 } 2978 2979 /* 2980 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2981 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2982 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2983 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 
2984 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2985 state, and needs to be "assembled" later by compressing B's column space. 2986 2987 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2988 Following this call, C->A & C->B have been created, even if empty. 2989 */ 2990 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2991 { 2992 /* If making this function public, change the error returned in this function away from _PLIB. */ 2993 PetscErrorCode ierr; 2994 Mat_MPIAIJ *aij; 2995 Mat_SeqAIJ *Baij; 2996 PetscBool seqaij,Bdisassembled; 2997 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2998 PetscScalar v; 2999 const PetscInt *rowindices,*colindices; 3000 3001 PetscFunctionBegin; 3002 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 3003 if (A) { 3004 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 3005 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 3006 if (rowemb) { 3007 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 3008 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 3009 } else { 3010 if (C->rmap->n != A->rmap->n) { 3011 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 3012 } 3013 } 3014 if (dcolemb) { 3015 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 3016 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 3017 } else { 3018 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 3019 } 3020 } 3021 if (B) { 3022 ierr = 
PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 3023 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 3024 if (rowemb) { 3025 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 3026 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 3027 } else { 3028 if (C->rmap->n != B->rmap->n) { 3029 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 3030 } 3031 } 3032 if (ocolemb) { 3033 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 3034 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 3035 } else { 3036 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 3037 } 3038 } 3039 3040 aij = (Mat_MPIAIJ*)(C->data); 3041 if (!aij->A) { 3042 /* Mimic parts of MatMPIAIJSetPreallocation() */ 3043 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 3044 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 3045 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 3046 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 3047 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 3048 } 3049 if (A) { 3050 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 3051 } else { 3052 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 3053 } 3054 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 3055 /* 3056 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 3057 need to "disassemble" B -- convert it to using C's global indices. 3058 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 
3059 3060 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 3061 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 3062 3063 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 3064 At least avoid calling MatSetValues() and the implied searches? 3065 */ 3066 3067 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 3068 #if defined(PETSC_USE_CTABLE) 3069 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 3070 #else 3071 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 3072 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 3073 if (aij->B) { 3074 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 3075 } 3076 #endif 3077 ngcol = 0; 3078 if (aij->lvec) { 3079 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 3080 } 3081 if (aij->garray) { 3082 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 3083 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 3084 } 3085 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 3086 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 3087 } 3088 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 3089 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 3090 } 3091 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3092 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3093 } 3094 } 3095 Bdisassembled = PETSC_FALSE; 3096 if (!aij->B) { 3097 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3098 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3099 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3100 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3101 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3102 Bdisassembled = PETSC_TRUE; 3103 } 3104 if (B) { 3105 Baij = (Mat_SeqAIJ*)(B->data); 3106 if (pattern == DIFFERENT_NONZERO_PATTERN) { 
3107 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3108 for (i=0; i<B->rmap->n; i++) { 3109 nz[i] = Baij->i[i+1] - Baij->i[i]; 3110 } 3111 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3112 ierr = PetscFree(nz);CHKERRQ(ierr); 3113 } 3114 3115 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3116 shift = rend-rstart; 3117 count = 0; 3118 rowindices = NULL; 3119 colindices = NULL; 3120 if (rowemb) { 3121 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3122 } 3123 if (ocolemb) { 3124 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3125 } 3126 for (i=0; i<B->rmap->n; i++) { 3127 PetscInt row; 3128 row = i; 3129 if (rowindices) row = rowindices[i]; 3130 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3131 col = Baij->j[count]; 3132 if (colindices) col = colindices[col]; 3133 if (Bdisassembled && col>=rstart) col += shift; 3134 v = Baij->a[count]; 3135 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3136 ++count; 3137 } 3138 } 3139 /* No assembly for aij->B is necessary. */ 3140 /* FIXME: set aij->B's nonzerostate correctly. */ 3141 } else { 3142 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3143 } 3144 C->preallocated = PETSC_TRUE; 3145 C->was_assembled = PETSC_FALSE; 3146 C->assembled = PETSC_FALSE; 3147 /* 3148 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3149 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3150 */ 3151 PetscFunctionReturn(0); 3152 } 3153 3154 /* 3155 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
3156 */ 3157 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3158 { 3159 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3160 3161 PetscFunctionBegin; 3162 PetscValidPointer(A,2); 3163 PetscValidPointer(B,3); 3164 /* FIXME: make sure C is assembled */ 3165 *A = aij->A; 3166 *B = aij->B; 3167 /* Note that we don't incref *A and *B, so be careful! */ 3168 PetscFunctionReturn(0); 3169 } 3170 3171 /* 3172 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3173 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3174 */ 3175 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3176 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3177 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3178 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3179 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3180 { 3181 PetscErrorCode ierr; 3182 PetscMPIInt isize,flag; 3183 PetscInt i,ii,cismax,ispar; 3184 Mat *A,*B; 3185 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3186 3187 PetscFunctionBegin; 3188 if (!ismax) PetscFunctionReturn(0); 3189 3190 for (i = 0, cismax = 0; i < ismax; ++i) { 3191 PetscMPIInt isize; 3192 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3193 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3194 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3195 if (isize > 1) ++cismax; 3196 } 3197 3198 /* 3199 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3200 ispar counts the number of parallel ISs across C's comm. 
3201 */ 3202 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3203 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3204 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3205 PetscFunctionReturn(0); 3206 } 3207 3208 /* if (ispar) */ 3209 /* 3210 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3211 These are used to extract the off-diag portion of the resulting parallel matrix. 3212 The row IS for the off-diag portion is the same as for the diag portion, 3213 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3214 */ 3215 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3216 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3217 for (i = 0, ii = 0; i < ismax; ++i) { 3218 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3219 if (isize > 1) { 3220 /* 3221 TODO: This is the part that's ***NOT SCALABLE***. 3222 To fix this we need to extract just the indices of C's nonzero columns 3223 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3224 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3225 to be done without serializing on the IS list, so, most likely, it is best 3226 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3227 */ 3228 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3229 /* Now we have to 3230 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3231 were sorted on each rank, concatenated they might no longer be sorted; 3232 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3233 indices in the nondecreasing order to the original index positions. 3234 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 
3235 */ 3236 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3237 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3238 ++ii; 3239 } 3240 } 3241 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3242 for (i = 0, ii = 0; i < ismax; ++i) { 3243 PetscInt j,issize; 3244 const PetscInt *indices; 3245 3246 /* 3247 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3248 */ 3249 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3250 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3251 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3252 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3253 for (j = 1; j < issize; ++j) { 3254 if (indices[j] == indices[j-1]) { 3255 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3256 } 3257 } 3258 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3259 3260 3261 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3262 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3263 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3264 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3265 for (j = 1; j < issize; ++j) { 3266 if (indices[j-1] == indices[j]) { 3267 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3268 } 3269 } 3270 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3271 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3272 if (isize > 1) { 3273 cisrow[ii] = isrow[i]; 3274 ++ii; 3275 } 3276 } 3277 /* 3278 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3279 array of sequential matrices underlying the resulting parallel matrices. 3280 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3281 contain duplicates. 
3282 3283 There are as many diag matrices as there are original index sets. There are only as many parallel 3284 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3285 3286 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3287 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3288 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3289 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3290 with A[i] and B[ii] extracted from the corresponding MPI submat. 3291 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3292 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3293 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3294 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3295 values into A[i] and B[ii] sitting inside the corresponding submat. 3296 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3297 A[i], B[ii], AA[i] or BB[ii] matrices. 3298 */ 3299 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3300 if (scall == MAT_INITIAL_MATRIX) { 3301 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3302 } 3303 3304 /* Now obtain the sequential A and B submatrices separately. 
*/ 3305 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3306 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3307 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3308 3309 /* 3310 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3311 matrices A & B have been extracted directly into the parallel matrices containing them, or 3312 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3313 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3314 to have the same sparsity pattern. 3315 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3316 must be constructed for C. This is done by setseqmat(s). 3317 */ 3318 for (i = 0, ii = 0; i < ismax; ++i) { 3319 /* 3320 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3321 That way we can avoid sorting and computing permutations when reusing. 3322 To this end: 3323 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3324 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3325 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3326 */ 3327 MatStructure pattern; 3328 pattern = DIFFERENT_NONZERO_PATTERN; 3329 3330 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3331 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3332 if (isize > 1) { 3333 if (scall == MAT_INITIAL_MATRIX) { 3334 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3335 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3336 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3337 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3338 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3339 } 3340 /* 3341 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3342 */ 3343 { 3344 Mat AA,BB; 3345 AA = A[i]; 3346 BB = B[ii]; 3347 if (AA || BB) { 3348 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3349 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3350 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3351 } 3352 3353 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3354 ierr = MatDestroy(&BB);CHKERRQ(ierr); 3355 } 3356 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3357 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3358 ++ii; 3359 } else { /* if (isize == 1) */ 3360 if (scall == MAT_REUSE_MATRIX) { 3361 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3362 } 3363 if (isrow_p[i] || iscol_p[i]) { 3364 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3365 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3366 /* Otherwise A is extracted straight into (*submats)[i]. */ 3367 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3368 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3369 } else (*submat)[i] = A[i]; 3370 } 3371 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3372 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3373 } 3374 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3375 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3376 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3377 ierr = PetscFree(A);CHKERRQ(ierr); 3378 ierr = PetscFree(B);CHKERRQ(ierr); 3379 PetscFunctionReturn(0); 3380 } 3381 3382 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3383 { 3384 PetscErrorCode ierr; 3385 3386 PetscFunctionBegin; 3387 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3388 PetscFunctionReturn(0); 3389 } 3390