1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++){ 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++){ 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank){ 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows){ 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++){ 115 if (tosizes_temp[i]){ 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 for (i=0; i<nto; i++){ 123 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 124 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 125 } 126 /* send information to other processors */ 127 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 128 nrecvrows = 0; 129 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 130 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 131 nrecvrows = 0; 132 for (i=0; i<nfrom; i++){ 133 for (j=0; j<fromsizes[2*i]; j++){ 134 remote[nrecvrows].rank = fromranks[i]; 135 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 136 } 137 } 138 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 139 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 140 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 141 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 142 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 143 /* message pair <no of is, row> */ 144 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 145 for (i=0; i<nrrows; i++){ 146 owner = rrow_ranks[i]; /* processor */ 147 j = tosizes_temp[owner]; /* index */ 148 todata[toffsets[j]++] = rrow_isids[i]; 149 todata[toffsets[j]++] = remoterows[i]; 150 } 151 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 152 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 153 ierr = PetscFree(toffsets);CHKERRQ(ierr); 154 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 155 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 157 /* send rows belonging to the remote so that then we could get the overlapping data back */ 158 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 159 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 160 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 161 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 162 ierr = PetscFree(fromranks);CHKERRQ(ierr); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 165 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 166 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 167 nrecvrows = 0; 168 for (i=0; i<nto; i++){ 169 for (j=0; j<tosizes[2*i]; j++){ 170 remote[nrecvrows].rank = toranks[i]; 171 remote[nrecvrows++].index = tosizes[2*i+1]+j; 172 } 173 } 174 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 175 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 176 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 177 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 178 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 179 /* overlap communication and computation */ 180 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 181 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 182 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 183 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 184 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 185 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 186 ierr = PetscFree(toranks);CHKERRQ(ierr); 187 ierr = PetscFree(tosizes);CHKERRQ(ierr); 188 ierr = PetscFree(todata);CHKERRQ(ierr); 189 PetscFunctionReturn(0); 190 } 191 192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 193 { 194 PetscInt *isz,isz_i,i,j,is_id, data_size; 195 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 196 const PetscInt *indices_i_temp; 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 max_lsize = 0; 201 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 202 for (i=0; i<nidx; i++){ 203 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 204 max_lsize = lsize>max_lsize ? lsize:max_lsize; 205 isz[i] = lsize; 206 } 207 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 208 for (i=0; i<nidx; i++){ 209 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 210 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 212 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 213 } 214 /* retrieve information to get row id and its overlap */ 215 for (i=0; i<nrecvs; ){ 216 is_id = recvdata[i++]; 217 data_size = recvdata[i++]; 218 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 219 isz_i = isz[is_id]; 220 for (j=0; j< data_size; j++){ 221 col = recvdata[i++]; 222 indices_i[isz_i++] = col; 223 } 224 isz[is_id] = isz_i; 225 } 226 /* remove duplicate entities */ 227 for (i=0; i<nidx; i++){ 228 indices_i = indices_temp+(max_lsize+nrecvs)*i; 229 isz_i = isz[i]; 230 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 231 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 232 } 233 ierr = PetscFree(isz);CHKERRQ(ierr); 234 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 239 { 240 PetscLayout rmap,cmap; 241 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 242 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 243 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 244 const PetscInt *gcols,*ai,*aj,*bi,*bj; 245 Mat amat,bmat; 246 PetscMPIInt rank; 247 PetscBool done; 248 MPI_Comm comm; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 253 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 254 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 255 /* Even if the mat is symmetric, we still assume it is not symmetric */ 256 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 257 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 258 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 259 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 260 /* total number of nonzero values is used to estimate the memory usage in the next step */ 261 tnz = ai[an]+bi[bn]; 262 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 263 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 264 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 265 /* to find the longest message */ 266 max_fszs = 0; 267 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 268 /* better way to estimate number of nonzero in the mat??? */ 269 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 270 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 271 rows_pos = 0; 272 totalrows = 0; 273 for (i=0; i<nfrom; i++){ 274 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 275 /* group data together */ 276 for (j=0; j<fromsizes[2*i]; j+=2){ 277 is_id = fromrows[rows_pos++];/* no of is */ 278 rows_i = rows_data[is_id]; 279 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 280 } 281 /* estimate a space to avoid multiple allocations */ 282 for (j=0; j<nidx; j++){ 283 indvc_ij = 0; 284 rows_i = rows_data[j]; 285 for (l=0; l<rows_pos_i[j]; l++){ 286 row = rows_i[l]-rstart; 287 start = ai[row]; 288 end = ai[row+1]; 289 for (k=start; k<end; k++){ /* Amat */ 290 col = aj[k] + cstart; 291 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 292 } 293 start = bi[row]; 294 end = bi[row+1]; 295 for (k=start; k<end; k++) { /* Bmat */ 296 col = gcols[bj[k]]; 297 indices_tmp[indvc_ij++] = col; 298 } 299 } 300 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 301 indv_counts[i*nidx+j] = indvc_ij; 302 totalrows += indvc_ij; 303 } 304 } 305 /* message triple <no of is, number of rows, rows> */ 306 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 307 totalrows = 0; 308 rows_pos = 0; 309 /* use this code again */ 310 for (i=0;i<nfrom;i++){ 311 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 312 for (j=0; j<fromsizes[2*i]; j+=2){ 313 is_id = fromrows[rows_pos++]; 314 rows_i = rows_data[is_id]; 315 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 316 } 317 /* add data */ 318 for (j=0; j<nidx; j++){ 319 if (!indv_counts[i*nidx+j]) continue; 320 indvc_ij = 0; 321 sbdata[totalrows++] = j; 322 sbdata[totalrows++] = indv_counts[i*nidx+j]; 323 sbsizes[2*i] += 2; 324 rows_i = rows_data[j]; 325 for (l=0; l<rows_pos_i[j]; l++){ 326 row = rows_i[l]-rstart; 327 start = ai[row]; 328 end = ai[row+1]; 329 for (k=start; k<end; k++){ /* Amat */ 330 col = aj[k] + cstart; 331 indices_tmp[indvc_ij++] = col; 332 } 333 start = bi[row]; 334 end = bi[row+1]; 335 for (k=start; k<end; k++) { /* Bmat */ 336 col = gcols[bj[k]]; 337 indices_tmp[indvc_ij++] = col; 338 } 339 } 340 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 341 sbsizes[2*i] += indvc_ij; 342 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 343 totalrows += indvc_ij; 344 } 345 } 346 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 347 for (i=0; i<nfrom; i++){ 348 offsets[i+1] = offsets[i] + sbsizes[2*i]; 349 sbsizes[2*i+1] = offsets[i]; 350 } 351 ierr = PetscFree(offsets);CHKERRQ(ierr); 352 if (sbrowsizes) *sbrowsizes = sbsizes; 353 if (sbrows) *sbrows = sbdata; 354 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 355 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 356 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 357 PetscFunctionReturn(0); 358 } 359 360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 361 { 362 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 363 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 364 PetscInt lsize,lsize_tmp,owner; 365 PetscMPIInt rank; 366 Mat amat,bmat; 367 PetscBool done; 368 PetscLayout cmap,rmap; 369 MPI_Comm comm; 370 PetscErrorCode ierr; 371 372 PetscFunctionBegin; 373 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 374 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 375 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 376 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 377 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 378 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 379 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 380 /* is it a safe way to compute number of nonzero values ? */ 381 tnz = ai[an]+bi[bn]; 382 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 383 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 384 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 385 /* it is a better way to estimate memory than the old implementation 386 * where global size of matrix is used 387 * */ 388 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 389 for (i=0; i<nidx; i++) { 390 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 391 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 392 lsize_tmp = 0; 393 for (j=0; j<lsize; j++) { 394 owner = -1; 395 row = indices[j]; 396 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 397 if (owner != rank) continue; 398 /* local number */ 399 row -= rstart; 400 start = ai[row]; 401 end = ai[row+1]; 402 for (k=start; k<end; k++) { /* Amat */ 403 col = aj[k] + cstart; 404 indices_temp[lsize_tmp++] = col; 405 } 406 start = bi[row]; 407 end = bi[row+1]; 408 for (k=start; k<end; k++) { /* Bmat */ 409 col = gcols[bj[k]]; 410 indices_temp[lsize_tmp++] = col; 411 } 412 } 413 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 414 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 415 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 416 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 417 } 418 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 419 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 420 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 421 PetscFunctionReturn(0); 422 } 423 424 425 /* 426 Sample message format: 427 If a processor A wants processor B to process some elements corresponding 428 to index sets is[1],is[5] 429 mesg [0] = 2 (no of index sets in the mesg) 430 ----------- 431 mesg [1] = 1 => is[1] 432 mesg [2] = sizeof(is[1]); 433 ----------- 434 mesg [3] = 5 => is[5] 435 mesg [4] = sizeof(is[5]); 436 ----------- 437 mesg [5] 438 mesg [n] datas[1] 439 ----------- 440 mesg[n+1] 441 mesg[m] data(is[5]) 442 ----------- 443 444 Notes: 445 nrqs - no of requests sent (or to be sent out) 446 nrqr - no of requests recieved (which have to be or which have been processed 447 */ 448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 449 { 450 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 451 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 452 const PetscInt **idx,*idx_i; 453 PetscInt *n,**data,len; 454 #if defined(PETSC_USE_CTABLE) 455 PetscTable *table_data,table_data_i; 456 PetscInt *tdata,tcount,tcount_max; 457 #else 458 PetscInt *data_i,*d_p; 459 #endif 460 PetscErrorCode ierr; 461 PetscMPIInt size,rank,tag1,tag2; 462 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 463 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 464 PetscBT *table; 465 MPI_Comm comm; 466 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 467 MPI_Status *s_status,*recv_status; 468 char *t_p; 469 470 PetscFunctionBegin; 471 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 472 size = c->size; 473 rank = c->rank; 474 M = C->rmap->N; 475 476 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 477 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 478 479 ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 480 481 for (i=0; i<imax; i++) { 482 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 483 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 484 } 485 486 /* evaluate communication - mesg to who,length of mesg, and buffer space 487 required. Based on this, buffers are allocated, and data copied into them */ 488 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 489 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 490 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 491 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 492 for (i=0; i<imax; i++) { 493 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 494 idx_i = idx[i]; 495 len = n[i]; 496 for (j=0; j<len; j++) { 497 row = idx_i[j]; 498 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 499 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 500 w4[proc]++; 501 } 502 for (j=0; j<size; j++) { 503 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 504 } 505 } 506 507 nrqs = 0; /* no of outgoing messages */ 508 msz = 0; /* total mesg length (for all proc */ 509 w1[rank] = 0; /* no mesg sent to intself */ 510 w3[rank] = 0; 511 for (i=0; i<size; i++) { 512 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 513 } 514 /* pa - is list of processors to communicate with */ 515 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 516 for (i=0,j=0; i<size; i++) { 517 if (w1[i]) {pa[j] = i; j++;} 518 } 519 520 /* Each message would have a header = 1 + 2*(no of IS) + data */ 521 for (i=0; i<nrqs; i++) { 522 j = pa[i]; 523 w1[j] += w2[j] + 2*w3[j]; 524 msz += w1[j]; 525 } 526 527 /* Determine the number of messages to expect, their lengths, from from-ids */ 528 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 529 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 530 531 /* Now post the Irecvs corresponding to these messages */ 532 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 533 534 /* Allocate Memory for outgoing messages */ 535 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 536 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 537 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 538 539 { 540 PetscInt *iptr = tmp,ict = 0; 541 for (i=0; i<nrqs; i++) { 542 j = pa[i]; 543 iptr += ict; 544 outdat[j] = iptr; 545 ict = w1[j]; 546 } 547 } 548 549 /* Form the outgoing messages */ 550 /* plug in the headers */ 551 for (i=0; i<nrqs; i++) { 552 j = pa[i]; 553 outdat[j][0] = 0; 554 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 555 ptr[j] = outdat[j] + 2*w3[j] + 1; 556 } 557 558 /* Memory for doing local proc's work */ 559 { 560 PetscInt M_BPB_imax = 0; 561 #if defined(PETSC_USE_CTABLE) 562 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 563 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 564 for (i=0; i<imax; i++) { 565 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 566 } 567 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 568 for (i=0; i<imax; i++) { 569 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 570 } 571 #else 572 PetscInt Mimax = 0; 573 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 574 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 575 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 576 for (i=0; i<imax; i++) { 577 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 578 data[i] = d_p + M*i; 579 } 580 #endif 581 } 582 583 /* Parse the IS and update local tables and the outgoing buf with the data */ 584 { 585 PetscInt n_i,isz_i,*outdat_j,ctr_j; 586 PetscBT table_i; 587 588 for (i=0; i<imax; i++) { 589 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 590 n_i = n[i]; 591 table_i = table[i]; 592 idx_i = idx[i]; 593 #if defined(PETSC_USE_CTABLE) 594 table_data_i = table_data[i]; 595 #else 596 data_i = data[i]; 597 #endif 598 isz_i = isz[i]; 599 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 600 row = idx_i[j]; 601 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 602 if (proc != rank) { /* copy to the outgoing buffer */ 603 ctr[proc]++; 604 *ptr[proc] = row; 605 ptr[proc]++; 606 } else if (!PetscBTLookupSet(table_i,row)) { 607 #if defined(PETSC_USE_CTABLE) 608 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 609 #else 610 data_i[isz_i] = row; /* Update the local table */ 611 #endif 612 isz_i++; 613 } 614 } 615 /* Update the headers for the current IS */ 616 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 617 if ((ctr_j = ctr[j])) { 618 outdat_j = outdat[j]; 619 k = ++outdat_j[0]; 620 outdat_j[2*k] = ctr_j; 621 outdat_j[2*k-1] = i; 622 } 623 } 624 isz[i] = isz_i; 625 } 626 } 627 628 /* Now post the sends */ 629 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 630 for (i=0; i<nrqs; ++i) { 631 j = pa[i]; 632 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 633 } 634 635 /* No longer need the original indices */ 636 for (i=0; i<imax; ++i) { 637 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 638 } 639 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 640 641 for (i=0; i<imax; ++i) { 642 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 643 } 644 645 /* Do Local work */ 646 #if defined(PETSC_USE_CTABLE) 647 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 648 #else 649 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 650 #endif 651 652 /* Receive messages */ 653 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 654 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 655 656 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 657 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 658 659 /* Phase 1 sends are complete - deallocate buffers */ 660 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 661 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 662 663 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 664 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 665 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 666 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 667 ierr = PetscFree(rbuf);CHKERRQ(ierr); 668 669 670 /* Send the data back */ 671 /* Do a global reduction to know the buffer space req for incoming messages */ 672 { 673 PetscMPIInt *rw1; 674 675 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 676 677 for (i=0; i<nrqr; ++i) { 678 proc = recv_status[i].MPI_SOURCE; 679 680 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 681 rw1[proc] = isz1[i]; 682 } 683 ierr = PetscFree(onodes1);CHKERRQ(ierr); 684 ierr = PetscFree(olengths1);CHKERRQ(ierr); 685 686 /* Determine the number of messages to expect, their lengths, from from-ids */ 687 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 688 ierr = PetscFree(rw1);CHKERRQ(ierr); 689 } 690 /* Now post the Irecvs corresponding to these messages */ 691 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 692 693 /* Now post the sends */ 694 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 695 for (i=0; i<nrqr; ++i) { 696 j = recv_status[i].MPI_SOURCE; 697 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 698 } 699 700 /* receive work done on other processors */ 701 { 702 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 703 PetscMPIInt idex; 704 PetscBT table_i; 705 MPI_Status *status2; 706 707 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 708 for (i=0; i<nrqs; ++i) { 709 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 710 /* Process the message */ 711 rbuf2_i = rbuf2[idex]; 712 ct1 = 2*rbuf2_i[0]+1; 713 jmax = rbuf2[idex][0]; 714 for (j=1; j<=jmax; j++) { 715 max = rbuf2_i[2*j]; 716 is_no = rbuf2_i[2*j-1]; 717 isz_i = isz[is_no]; 718 table_i = table[is_no]; 719 #if defined(PETSC_USE_CTABLE) 720 table_data_i = table_data[is_no]; 721 #else 722 data_i = data[is_no]; 723 #endif 724 for (k=0; k<max; k++,ct1++) { 725 row = rbuf2_i[ct1]; 726 if (!PetscBTLookupSet(table_i,row)) { 727 #if defined(PETSC_USE_CTABLE) 728 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 729 #else 730 data_i[isz_i] = row; 731 #endif 732 isz_i++; 733 } 734 } 735 isz[is_no] = isz_i; 736 } 737 } 738 739 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 740 ierr = PetscFree(status2);CHKERRQ(ierr); 741 } 742 743 #if defined(PETSC_USE_CTABLE) 744 tcount_max = 0; 745 for (i=0; i<imax; ++i) { 746 table_data_i = table_data[i]; 747 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 748 if (tcount_max < tcount) tcount_max = tcount; 749 } 750 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 751 #endif 752 753 for (i=0; i<imax; ++i) { 754 #if defined(PETSC_USE_CTABLE) 755 PetscTablePosition tpos; 756 table_data_i = table_data[i]; 757 758 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 759 while (tpos) { 760 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 761 tdata[--j] = --k; 762 } 763 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 764 #else 765 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 766 #endif 767 } 768 769 ierr = PetscFree(onodes2);CHKERRQ(ierr); 770 ierr = PetscFree(olengths2);CHKERRQ(ierr); 771 772 ierr = PetscFree(pa);CHKERRQ(ierr); 773 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 774 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 775 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 776 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 777 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 778 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 779 ierr = PetscFree(s_status);CHKERRQ(ierr); 780 ierr = PetscFree(recv_status);CHKERRQ(ierr); 781 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 782 ierr = PetscFree(xdata);CHKERRQ(ierr); 783 ierr = PetscFree(isz1);CHKERRQ(ierr); 784 #if defined(PETSC_USE_CTABLE) 785 for (i=0; i<imax; i++) { 786 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 787 } 788 ierr = PetscFree(table_data);CHKERRQ(ierr); 789 ierr = PetscFree(tdata);CHKERRQ(ierr); 790 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 791 #else 792 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 793 #endif 794 PetscFunctionReturn(0); 795 } 796 797 /* 798 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 799 the work on the local processor. 800 801 Inputs: 802 C - MAT_MPIAIJ; 803 imax - total no of index sets processed at a time; 804 table - an array of char - size = m bits. 805 806 Output: 807 isz - array containing the count of the solution elements corresponding 808 to each index set; 809 data or table_data - pointer to the solutions 810 */ 811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 812 { 813 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 814 Mat A = c->A,B = c->B; 815 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 816 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 817 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 818 PetscBT table_i; 819 #if defined(PETSC_USE_CTABLE) 820 PetscTable table_data_i; 821 PetscErrorCode ierr; 822 PetscTablePosition tpos; 823 PetscInt tcount,*tdata; 824 #else 825 PetscInt *data_i; 826 #endif 827 828 PetscFunctionBegin; 829 rstart = C->rmap->rstart; 830 cstart = C->cmap->rstart; 831 ai = a->i; 832 aj = a->j; 833 bi = b->i; 834 bj = b->j; 835 garray = c->garray; 836 837 for (i=0; i<imax; i++) { 838 #if defined(PETSC_USE_CTABLE) 839 /* copy existing entries of table_data_i into tdata[] */ 840 table_data_i = table_data[i]; 841 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 842 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 843 844 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 845 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 846 while (tpos) { 847 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 848 tdata[--j] = --row; 849 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 850 } 851 #else 852 data_i = data[i]; 853 #endif 854 table_i = table[i]; 855 isz_i = isz[i]; 856 max = isz[i]; 857 858 for (j=0; j<max; j++) { 859 #if defined(PETSC_USE_CTABLE) 860 row = tdata[j] - rstart; 861 #else 862 row = data_i[j] - rstart; 863 #endif 864 start = ai[row]; 865 end = ai[row+1]; 866 for (k=start; k<end; k++) { /* Amat */ 867 val = aj[k] + cstart; 868 if (!PetscBTLookupSet(table_i,val)) { 869 #if defined(PETSC_USE_CTABLE) 870 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 871 #else 872 data_i[isz_i] = val; 873 #endif 874 isz_i++; 875 } 876 } 877 start = bi[row]; 878 end = bi[row+1]; 879 for (k=start; k<end; k++) { /* Bmat */ 880 val = garray[bj[k]]; 881 if (!PetscBTLookupSet(table_i,val)) { 882 #if defined(PETSC_USE_CTABLE) 883 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 884 #else 885 data_i[isz_i] = val; 886 #endif 887 isz_i++; 888 } 889 } 890 } 891 isz[i] = isz_i; 892 893 #if defined(PETSC_USE_CTABLE) 894 ierr = PetscFree(tdata);CHKERRQ(ierr); 895 #endif 896 } 897 PetscFunctionReturn(0); 898 } 899 900 /* 901 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 902 and return the output 903 904 Input: 905 C - the matrix 906 nrqr - no of messages being processed. 907 rbuf - an array of pointers to the recieved requests 908 909 Output: 910 xdata - array of messages to be sent back 911 isz1 - size of each message 912 913 For better efficiency perhaps we should malloc separately each xdata[i], 914 then if a remalloc is required we need only copy the data for that one row 915 rather then all previous rows as it is now where a single large chunck of 916 memory is used. 917 918 */ 919 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 920 { 921 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 922 Mat A = c->A,B = c->B; 923 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 924 PetscErrorCode ierr; 925 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 926 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 927 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 928 PetscInt *rbuf_i,kmax,rbuf_0; 929 PetscBT xtable; 930 931 PetscFunctionBegin; 932 m = C->rmap->N; 933 rstart = C->rmap->rstart; 934 cstart = C->cmap->rstart; 935 ai = a->i; 936 aj = a->j; 937 bi = b->i; 938 bj = b->j; 939 garray = c->garray; 940 941 942 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 943 rbuf_i = rbuf[i]; 944 rbuf_0 = rbuf_i[0]; 945 ct += rbuf_0; 946 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 947 } 948 949 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 950 else max1 = 1; 951 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 952 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 953 ++no_malloc; 954 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 955 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 956 957 ct3 = 0; 958 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 959 rbuf_i = rbuf[i]; 960 rbuf_0 = rbuf_i[0]; 961 ct1 = 2*rbuf_0+1; 962 ct2 = ct1; 963 ct3 += ct1; 964 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 965 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 966 oct2 = ct2; 967 kmax = rbuf_i[2*j]; 968 for (k=0; k<kmax; k++,ct1++) { 969 row = rbuf_i[ct1]; 970 if (!PetscBTLookupSet(xtable,row)) { 971 if (!(ct3 < mem_estimate)) { 972 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 973 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 974 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 975 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 976 xdata[0] = tmp; 977 mem_estimate = new_estimate; ++no_malloc; 978 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 979 } 980 xdata[i][ct2++] = row; 981 ct3++; 982 } 983 } 984 for (k=oct2,max2=ct2; k<max2; k++) { 985 row = xdata[i][k] - rstart; 986 start = ai[row]; 987 end = ai[row+1]; 988 for (l=start; l<end; l++) { 989 val = aj[l] + cstart; 990 if (!PetscBTLookupSet(xtable,val)) { 991 if (!(ct3 < mem_estimate)) { 992 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 993 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 994 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 995 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 996 xdata[0] = tmp; 997 mem_estimate = new_estimate; ++no_malloc; 998 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 999 } 1000 xdata[i][ct2++] = val; 1001 ct3++; 1002 } 1003 } 1004 start = bi[row]; 1005 end = bi[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = garray[bj[l]]; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 } 1023 /* Update the header*/ 1024 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1025 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1026 } 1027 xdata[i][0] = rbuf_0; 1028 xdata[i+1] = xdata[i] + ct2; 1029 isz1[i] = ct2; /* size of each message */ 1030 } 1031 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1032 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1033 PetscFunctionReturn(0); 1034 } 1035 /* -------------------------------------------------------------------------*/ 1036 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1037 /* 1038 Every processor gets the entire matrix 1039 */ 1040 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1041 { 1042 Mat B; 1043 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1044 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1045 PetscErrorCode ierr; 1046 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1047 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1048 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1049 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1050 1051 PetscFunctionBegin; 1052 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1053 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1054 1055 if (scall == MAT_INITIAL_MATRIX) { 1056 /* ---------------------------------------------------------------- 1057 Tell every processor the number of nonzeros per row 1058 */ 1059 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1060 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1061 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1062 } 1063 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1064 for (i=0; i<size; i++) { 1065 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1066 displs[i] = A->rmap->range[i]; 1067 } 1068 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1069 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1070 #else 1071 sendcount = A->rmap->rend - A->rmap->rstart; 1072 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1073 #endif 1074 /* --------------------------------------------------------------- 1075 Create the sequential matrix of the same type as the local block diagonal 1076 */ 1077 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1078 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1079 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1080 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1081 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1082 ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr); 1083 **Bin = B; 1084 b = (Mat_SeqAIJ*)B->data; 1085 1086 /*-------------------------------------------------------------------- 1087 Copy my part of matrix column indices over 1088 */ 1089 sendcount = ad->nz + bd->nz; 1090 jsendbuf = b->j + b->i[rstarts[rank]]; 1091 a_jsendbuf = ad->j; 1092 b_jsendbuf = bd->j; 1093 n = A->rmap->rend - A->rmap->rstart; 1094 cnt = 0; 1095 for (i=0; i<n; i++) { 1096 1097 /* put in lower diagonal portion */ 1098 m = bd->i[i+1] - bd->i[i]; 1099 while (m > 0) { 1100 /* is it above diagonal (in bd (compressed) numbering) */ 1101 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1102 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1103 m--; 1104 } 1105 1106 /* put in diagonal portion */ 1107 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1108 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1109 } 1110 1111 /* put in upper diagonal portion */ 1112 while (m-- > 0) { 1113 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1114 } 1115 } 1116 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1117 1118 /*-------------------------------------------------------------------- 1119 Gather all column indices to all processors 1120 */ 1121 for (i=0; i<size; i++) { 1122 recvcounts[i] = 0; 1123 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1124 recvcounts[i] += lens[j]; 1125 } 1126 } 1127 displs[0] = 0; 1128 for (i=1; i<size; i++) { 1129 displs[i] = displs[i-1] + recvcounts[i-1]; 1130 } 1131 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1132 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1133 #else 1134 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1135 #endif 1136 /*-------------------------------------------------------------------- 1137 Assemble the matrix into useable form (note numerical values not yet set) 1138 */ 1139 /* set the b->ilen (length of each row) values */ 1140 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1141 /* set the b->i indices */ 1142 b->i[0] = 0; 1143 for (i=1; i<=A->rmap->N; i++) { 1144 b->i[i] = b->i[i-1] + lens[i-1]; 1145 } 1146 ierr = PetscFree(lens);CHKERRQ(ierr); 1147 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1148 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1149 1150 } else { 1151 B = **Bin; 1152 b = (Mat_SeqAIJ*)B->data; 1153 } 1154 1155 /*-------------------------------------------------------------------- 1156 Copy my part of matrix numerical values into the values location 1157 */ 1158 if (flag == MAT_GET_VALUES) { 1159 sendcount = ad->nz + bd->nz; 1160 sendbuf = b->a + b->i[rstarts[rank]]; 1161 a_sendbuf = ad->a; 1162 b_sendbuf = bd->a; 1163 b_sendj = bd->j; 1164 n = A->rmap->rend - A->rmap->rstart; 1165 cnt = 0; 1166 for (i=0; i<n; i++) { 1167 1168 /* put in lower diagonal portion */ 1169 m = bd->i[i+1] - bd->i[i]; 1170 while (m > 0) { 1171 /* is it above diagonal (in bd (compressed) numbering) */ 1172 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1173 sendbuf[cnt++] = *b_sendbuf++; 1174 m--; 1175 b_sendj++; 1176 } 1177 1178 /* put in diagonal portion */ 1179 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1180 sendbuf[cnt++] = *a_sendbuf++; 1181 } 1182 1183 /* put in upper diagonal portion */ 1184 while (m-- > 0) { 1185 sendbuf[cnt++] = *b_sendbuf++; 1186 b_sendj++; 1187 } 1188 } 1189 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1190 1191 /* ----------------------------------------------------------------- 1192 Gather all numerical values to all processors 1193 */ 1194 if (!recvcounts) { 1195 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1196 } 1197 for (i=0; i<size; i++) { 1198 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1199 } 1200 displs[0] = 0; 1201 for (i=1; i<size; i++) { 1202 displs[i] = displs[i-1] + recvcounts[i-1]; 1203 } 1204 recvbuf = b->a; 1205 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1206 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1207 #else 1208 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1209 #endif 1210 } /* endof (flag == MAT_GET_VALUES) */ 1211 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1212 1213 if (A->symmetric) { 1214 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1215 } else if (A->hermitian) { 1216 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1217 } else if (A->structurally_symmetric) { 1218 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1219 } 1220 PetscFunctionReturn(0); 1221 } 1222 1223 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1224 { 1225 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1226 Mat submat,A = c->A,B = c->B; 1227 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1228 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1229 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1230 const PetscInt *icol,*irow; 1231 PetscInt nrow,ncol,start; 1232 PetscErrorCode ierr; 1233 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1234 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1235 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1236 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1237 PetscInt *lens,rmax,ncols,*cols,Crow; 1238 #if defined(PETSC_USE_CTABLE) 1239 PetscTable cmap,rmap; 1240 PetscInt *cmap_loc,*rmap_loc; 1241 #else 1242 PetscInt *cmap,*rmap; 1243 #endif 1244 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1245 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1246 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1247 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1248 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1249 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1250 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1251 MPI_Comm comm; 1252 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1253 PetscMPIInt *onodes1,*olengths1,idex,end; 1254 Mat_SubSppt *smatis1; 1255 PetscBool isrowsorted,iscolsorted; 1256 1257 PetscFunctionBegin; 1258 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1259 1260 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1261 size = c->size; 1262 rank = c->rank; 1263 1264 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1265 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1266 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1267 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1268 if (allcolumns) { 1269 icol = NULL; 1270 ncol = C->cmap->N; 1271 } else { 1272 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1273 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1274 } 1275 1276 if (scall == MAT_INITIAL_MATRIX) { 1277 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1278 1279 /* Get some new tags to keep the communication clean */ 1280 tag1 = ((PetscObject)C)->tag; 1281 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1282 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1283 1284 /* evaluate communication - mesg to who, length of mesg, and buffer space 1285 required. Based on this, buffers are allocated, and data copied into them */ 1286 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1287 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1288 1289 /* w1[proc] = num of rows owned by proc -- to be requested */ 1290 proc = 0; 1291 nrqs = 0; /* num of outgoing messages */ 1292 for (j=0; j<nrow; j++) { 1293 row = irow[j]; 1294 if (!isrowsorted) proc = 0; 1295 while (row >= C->rmap->range[proc+1]) proc++; 1296 w1[proc]++; 1297 row2proc[j] = proc; /* map row index to proc */ 1298 1299 if (proc != rank && !w2[proc]) { 1300 w2[proc] = 1; nrqs++; 1301 } 1302 } 1303 w1[rank] = 0; /* rows owned by self will not be requested */ 1304 1305 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1306 for (proc=0,j=0; proc<size; proc++) { 1307 if (w1[proc]) { pa[j++] = proc;} 1308 } 1309 1310 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1311 msz = 0; /* total mesg length (for all procs) */ 1312 for (i=0; i<nrqs; i++) { 1313 proc = pa[i]; 1314 w1[proc] += 3; 1315 msz += w1[proc]; 1316 } 1317 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1318 1319 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1320 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1321 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1322 1323 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1324 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1325 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1326 1327 /* Now post the Irecvs corresponding to these messages */ 1328 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1329 1330 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1331 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1332 1333 /* Allocate Memory for outgoing messages */ 1334 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1335 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1336 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1337 1338 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1339 iptr = tmp; 1340 for (i=0; i<nrqs; i++) { 1341 proc = pa[i]; 1342 sbuf1[proc] = iptr; 1343 iptr += w1[proc]; 1344 } 1345 1346 /* Form the outgoing messages */ 1347 /* Initialize the header space */ 1348 for (i=0; i<nrqs; i++) { 1349 proc = pa[i]; 1350 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1351 ptr[proc] = sbuf1[proc] + 3; 1352 } 1353 1354 /* Parse the isrow and copy data into outbuf */ 1355 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1356 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1357 proc = row2proc[j]; 1358 if (proc != rank) { /* copy to the outgoing buf*/ 1359 *ptr[proc] = irow[j]; 1360 ctr[proc]++; ptr[proc]++; 1361 } 1362 } 1363 1364 /* Update the headers for the current IS */ 1365 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1366 if ((ctr_j = ctr[j])) { 1367 sbuf1_j = sbuf1[j]; 1368 k = ++sbuf1_j[0]; 1369 sbuf1_j[2*k] = ctr_j; 1370 sbuf1_j[2*k-1] = 0; 1371 } 1372 } 1373 1374 /* Now post the sends */ 1375 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1376 for (i=0; i<nrqs; ++i) { 1377 proc = pa[i]; 1378 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1379 } 1380 1381 /* Post Receives to capture the buffer size */ 1382 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1383 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1384 1385 rbuf2[0] = tmp + msz; 1386 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1387 1388 for (i=0; i<nrqs; ++i) { 1389 proc = pa[i]; 1390 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1391 } 1392 1393 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1394 1395 /* Send to other procs the buf size they should allocate */ 1396 /* Receive messages*/ 1397 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1398 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1399 1400 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1401 for (i=0; i<nrqr; ++i) { 1402 req_size[i] = 0; 1403 rbuf1_i = rbuf1[i]; 1404 start = 2*rbuf1_i[0] + 1; 1405 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1406 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1407 sbuf2_i = sbuf2[i]; 1408 for (j=start; j<end; j++) { 1409 k = rbuf1_i[j] - rstart; 1410 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1411 sbuf2_i[j] = ncols; 1412 req_size[i] += ncols; 1413 } 1414 req_source1[i] = r_status1[i].MPI_SOURCE; 1415 1416 /* form the header */ 1417 sbuf2_i[0] = req_size[i]; 1418 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1419 1420 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1421 } 1422 1423 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1424 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1425 1426 /* rbuf2 is received, Post recv column indices a->j */ 1427 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1428 1429 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1430 for (i=0; i<nrqs; ++i) { 1431 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1432 req_source2[i] = r_status2[i].MPI_SOURCE; 1433 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1434 } 1435 1436 /* Wait on sends1 and sends2 */ 1437 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1438 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1439 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1440 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1441 1442 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1443 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1444 1445 /* Now allocate sending buffers for a->j, and send them off */ 1446 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1447 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1448 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1449 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1450 1451 for (i=0; i<nrqr; i++) { /* for each requested message */ 1452 rbuf1_i = rbuf1[i]; 1453 sbuf_aj_i = sbuf_aj[i]; 1454 ct1 = 2*rbuf1_i[0] + 1; 1455 ct2 = 0; 1456 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1457 1458 kmax = rbuf1[i][2]; 1459 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1460 row = rbuf1_i[ct1] - rstart; 1461 nzA = ai[row+1] - ai[row]; 1462 nzB = bi[row+1] - bi[row]; 1463 ncols = nzA + nzB; 1464 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1465 1466 /* load the column indices for this row into cols*/ 1467 cols = sbuf_aj_i + ct2; 1468 1469 lwrite = 0; 1470 for (l=0; l<nzB; l++) { 1471 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1472 } 1473 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1474 for (l=0; l<nzB; l++) { 1475 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1476 } 1477 1478 ct2 += ncols; 1479 } 1480 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1481 } 1482 1483 /* create column map (cmap): global col of C -> local col of submat */ 1484 #if defined(PETSC_USE_CTABLE) 1485 if (!allcolumns) { 1486 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1487 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1488 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1489 if (icol[j] >= cstart && icol[j] <cend) { 1490 cmap_loc[icol[j] - cstart] = j+1; 1491 } else { /* use PetscTable for non-local col indices */ 1492 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1493 } 1494 } 1495 } else { 1496 cmap = NULL; 1497 cmap_loc = NULL; 1498 } 1499 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1500 #else 1501 if (!allcolumns) { 1502 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1503 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1504 } else { 1505 cmap = NULL; 1506 } 1507 #endif 1508 1509 /* Create lens for MatSeqAIJSetPreallocation() */ 1510 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1511 1512 /* Compute lens from local part of C */ 1513 for (j=0; j<nrow; j++) { 1514 row = irow[j]; 1515 proc = row2proc[j]; 1516 if (proc == rank) { 1517 /* diagonal part A = c->A */ 1518 ncols = ai[row-rstart+1] - ai[row-rstart]; 1519 cols = aj + ai[row-rstart]; 1520 if (!allcolumns) { 1521 for (k=0; k<ncols; k++) { 1522 #if defined(PETSC_USE_CTABLE) 1523 tcol = cmap_loc[cols[k]]; 1524 #else 1525 tcol = cmap[cols[k]+cstart]; 1526 #endif 1527 if (tcol) lens[j]++; 1528 } 1529 } else { /* allcolumns */ 1530 lens[j] = ncols; 1531 } 1532 1533 /* off-diagonal part B = c->B */ 1534 ncols = bi[row-rstart+1] - bi[row-rstart]; 1535 cols = bj + bi[row-rstart]; 1536 if (!allcolumns) { 1537 for (k=0; k<ncols; k++) { 1538 #if defined(PETSC_USE_CTABLE) 1539 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1540 #else 1541 tcol = cmap[bmap[cols[k]]]; 1542 #endif 1543 if (tcol) lens[j]++; 1544 } 1545 } else { /* allcolumns */ 1546 lens[j] += ncols; 1547 } 1548 } 1549 } 1550 1551 /* Create row map (rmap): global row of C -> local row of submat */ 1552 #if defined(PETSC_USE_CTABLE) 1553 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1554 for (j=0; j<nrow; j++) { 1555 row = irow[j]; 1556 proc = row2proc[j]; 1557 if (proc == rank) { /* a local row */ 1558 rmap_loc[row - rstart] = j; 1559 } else { 1560 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1561 } 1562 } 1563 #else 1564 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1565 for (j=0; j<nrow; j++) { 1566 rmap[irow[j]] = j; 1567 } 1568 #endif 1569 1570 /* Update lens from offproc data */ 1571 /* recv a->j is done */ 1572 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1573 for (i=0; i<nrqs; i++) { 1574 proc = pa[i]; 1575 sbuf1_i = sbuf1[proc]; 1576 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1577 ct1 = 2 + 1; 1578 ct2 = 0; 1579 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1580 rbuf3_i = rbuf3[i]; /* received C->j */ 1581 1582 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1583 max1 = sbuf1_i[2]; 1584 for (k=0; k<max1; k++,ct1++) { 1585 #if defined(PETSC_USE_CTABLE) 1586 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1587 row--; 1588 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1589 #else 1590 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1591 #endif 1592 /* Now, store row index of submat in sbuf1_i[ct1] */ 1593 sbuf1_i[ct1] = row; 1594 1595 nnz = rbuf2_i[ct1]; 1596 if (!allcolumns) { 1597 for (l=0; l<nnz; l++,ct2++) { 1598 #if defined(PETSC_USE_CTABLE) 1599 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1600 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1601 } else { 1602 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1603 } 1604 #else 1605 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1606 #endif 1607 if (tcol) lens[row]++; 1608 } 1609 } else { /* allcolumns */ 1610 lens[row] += nnz; 1611 } 1612 } 1613 } 1614 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1615 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1616 1617 /* Create the submatrices */ 1618 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1619 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1620 1621 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1622 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1623 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1624 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1625 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1626 1627 /* create struct Mat_SubSppt and attached it to submat */ 1628 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1629 subc = (Mat_SeqAIJ*)submat->data; 1630 subc->submatis1 = smatis1; 1631 1632 smatis1->id = 0; 1633 smatis1->nrqs = nrqs; 1634 smatis1->nrqr = nrqr; 1635 smatis1->rbuf1 = rbuf1; 1636 smatis1->rbuf2 = rbuf2; 1637 smatis1->rbuf3 = rbuf3; 1638 smatis1->sbuf2 = sbuf2; 1639 smatis1->req_source2 = req_source2; 1640 1641 smatis1->sbuf1 = sbuf1; 1642 smatis1->ptr = ptr; 1643 smatis1->tmp = tmp; 1644 smatis1->ctr = ctr; 1645 1646 smatis1->pa = pa; 1647 smatis1->req_size = req_size; 1648 smatis1->req_source1 = req_source1; 1649 1650 smatis1->allcolumns = allcolumns; 1651 smatis1->singleis = PETSC_TRUE; 1652 smatis1->row2proc = row2proc; 1653 smatis1->rmap = rmap; 1654 smatis1->cmap = cmap; 1655 #if defined(PETSC_USE_CTABLE) 1656 smatis1->rmap_loc = rmap_loc; 1657 smatis1->cmap_loc = cmap_loc; 1658 #endif 1659 1660 smatis1->destroy = submat->ops->destroy; 1661 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1662 submat->factortype = C->factortype; 1663 1664 /* compute rmax */ 1665 rmax = 0; 1666 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1667 1668 } else { /* scall == MAT_REUSE_MATRIX */ 1669 submat = submats[0]; 1670 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1671 1672 subc = (Mat_SeqAIJ*)submat->data; 1673 rmax = subc->rmax; 1674 smatis1 = subc->submatis1; 1675 nrqs = smatis1->nrqs; 1676 nrqr = smatis1->nrqr; 1677 rbuf1 = smatis1->rbuf1; 1678 rbuf2 = smatis1->rbuf2; 1679 rbuf3 = smatis1->rbuf3; 1680 req_source2 = smatis1->req_source2; 1681 1682 sbuf1 = smatis1->sbuf1; 1683 sbuf2 = smatis1->sbuf2; 1684 ptr = smatis1->ptr; 1685 tmp = smatis1->tmp; 1686 ctr = smatis1->ctr; 1687 1688 pa = smatis1->pa; 1689 req_size = smatis1->req_size; 1690 req_source1 = smatis1->req_source1; 1691 1692 allcolumns = smatis1->allcolumns; 1693 row2proc = smatis1->row2proc; 1694 rmap = smatis1->rmap; 1695 cmap = smatis1->cmap; 1696 #if defined(PETSC_USE_CTABLE) 1697 rmap_loc = smatis1->rmap_loc; 1698 cmap_loc = smatis1->cmap_loc; 1699 #endif 1700 } 1701 1702 /* Post recv matrix values */ 1703 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1704 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1705 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1706 for (i=0; i<nrqs; ++i) { 1707 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1708 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1709 } 1710 1711 /* Allocate sending buffers for a->a, and send them off */ 1712 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1713 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1714 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1715 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1716 1717 for (i=0; i<nrqr; i++) { 1718 rbuf1_i = rbuf1[i]; 1719 sbuf_aa_i = sbuf_aa[i]; 1720 ct1 = 2*rbuf1_i[0]+1; 1721 ct2 = 0; 1722 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1723 1724 kmax = rbuf1_i[2]; 1725 for (k=0; k<kmax; k++,ct1++) { 1726 row = rbuf1_i[ct1] - rstart; 1727 nzA = ai[row+1] - ai[row]; 1728 nzB = bi[row+1] - bi[row]; 1729 ncols = nzA + nzB; 1730 cworkB = bj + bi[row]; 1731 vworkA = a_a + ai[row]; 1732 vworkB = b_a + bi[row]; 1733 1734 /* load the column values for this row into vals*/ 1735 vals = sbuf_aa_i + ct2; 1736 1737 lwrite = 0; 1738 for (l=0; l<nzB; l++) { 1739 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1740 } 1741 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1742 for (l=0; l<nzB; l++) { 1743 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1744 } 1745 1746 ct2 += ncols; 1747 } 1748 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1749 } 1750 1751 /* Assemble submat */ 1752 /* First assemble the local rows */ 1753 for (j=0; j<nrow; j++) { 1754 row = irow[j]; 1755 proc = row2proc[j]; 1756 if (proc == rank) { 1757 Crow = row - rstart; /* local row index of C */ 1758 #if defined(PETSC_USE_CTABLE) 1759 row = rmap_loc[Crow]; /* row index of submat */ 1760 #else 1761 row = rmap[row]; 1762 #endif 1763 1764 if (allcolumns) { 1765 /* diagonal part A = c->A */ 1766 ncols = ai[Crow+1] - ai[Crow]; 1767 cols = aj + ai[Crow]; 1768 vals = a->a + ai[Crow]; 1769 i = 0; 1770 for (k=0; k<ncols; k++) { 1771 subcols[i] = cols[k] + cstart; 1772 subvals[i++] = vals[k]; 1773 } 1774 1775 /* off-diagonal part B = c->B */ 1776 ncols = bi[Crow+1] - bi[Crow]; 1777 cols = bj + bi[Crow]; 1778 vals = b->a + bi[Crow]; 1779 for (k=0; k<ncols; k++) { 1780 subcols[i] = bmap[cols[k]]; 1781 subvals[i++] = vals[k]; 1782 } 1783 1784 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1785 1786 } else { /* !allcolumns */ 1787 #if defined(PETSC_USE_CTABLE) 1788 /* diagonal part A = c->A */ 1789 ncols = ai[Crow+1] - ai[Crow]; 1790 cols = aj + ai[Crow]; 1791 vals = a->a + ai[Crow]; 1792 i = 0; 1793 for (k=0; k<ncols; k++) { 1794 tcol = cmap_loc[cols[k]]; 1795 if (tcol) { 1796 subcols[i] = --tcol; 1797 subvals[i++] = vals[k]; 1798 } 1799 } 1800 1801 /* off-diagonal part B = c->B */ 1802 ncols = bi[Crow+1] - bi[Crow]; 1803 cols = bj + bi[Crow]; 1804 vals = b->a + bi[Crow]; 1805 for (k=0; k<ncols; k++) { 1806 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1807 if (tcol) { 1808 subcols[i] = --tcol; 1809 subvals[i++] = vals[k]; 1810 } 1811 } 1812 #else 1813 /* diagonal part A = c->A */ 1814 ncols = ai[Crow+1] - ai[Crow]; 1815 cols = aj + ai[Crow]; 1816 vals = a->a + ai[Crow]; 1817 i = 0; 1818 for (k=0; k<ncols; k++) { 1819 tcol = cmap[cols[k]+cstart]; 1820 if (tcol) { 1821 subcols[i] = --tcol; 1822 subvals[i++] = vals[k]; 1823 } 1824 } 1825 1826 /* off-diagonal part B = c->B */ 1827 ncols = bi[Crow+1] - bi[Crow]; 1828 cols = bj + bi[Crow]; 1829 vals = b->a + bi[Crow]; 1830 for (k=0; k<ncols; k++) { 1831 tcol = cmap[bmap[cols[k]]]; 1832 if (tcol) { 1833 subcols[i] = --tcol; 1834 subvals[i++] = vals[k]; 1835 } 1836 } 1837 #endif 1838 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1839 } 1840 } 1841 } 1842 1843 /* Now assemble the off-proc rows */ 1844 for (i=0; i<nrqs; i++) { /* for each requested message */ 1845 /* recv values from other processes */ 1846 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1847 proc = pa[idex]; 1848 sbuf1_i = sbuf1[proc]; 1849 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1850 ct1 = 2 + 1; 1851 ct2 = 0; /* count of received C->j */ 1852 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1853 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1854 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1855 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1856 1857 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1858 max1 = sbuf1_i[2]; /* num of rows */ 1859 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1860 row = sbuf1_i[ct1]; /* row index of submat */ 1861 if (!allcolumns) { 1862 idex = 0; 1863 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1864 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1865 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1866 #if defined(PETSC_USE_CTABLE) 1867 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1868 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1869 } else { 1870 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1871 } 1872 #else 1873 tcol = cmap[rbuf3_i[ct2]]; 1874 #endif 1875 if (tcol) { 1876 subcols[idex] = --tcol; /* may not be sorted */ 1877 subvals[idex++] = rbuf4_i[ct2]; 1878 1879 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1880 For reuse, we replace received C->j with index that should be inserted to submat */ 1881 rbuf3_i[ct3++] = ct2; 1882 } 1883 } 1884 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1885 } else { /* scall == MAT_REUSE_MATRIX */ 1886 submat = submats[0]; 1887 subc = (Mat_SeqAIJ*)submat->data; 1888 1889 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1890 for (l=0; l<nnz; l++) { 1891 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1892 subvals[idex++] = rbuf4_i[ct2]; 1893 } 1894 1895 bj = subc->j + subc->i[row]; /* sorted column indices */ 1896 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1897 } 1898 } else { /* allcolumns */ 1899 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1900 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1901 ct2 += nnz; 1902 } 1903 } 1904 } 1905 1906 /* sending a->a are done */ 1907 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1908 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1909 1910 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1911 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1912 submats[0] = submat; 1913 1914 /* Restore the indices */ 1915 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1916 if (!allcolumns) { 1917 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1918 } 1919 1920 /* Destroy allocated memory */ 1921 for (i=0; i<nrqs; ++i) { 1922 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1923 } 1924 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1925 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1926 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1927 1928 if (scall == MAT_INITIAL_MATRIX) { 1929 ierr = PetscFree(lens);CHKERRQ(ierr); 1930 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1931 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1932 } 1933 PetscFunctionReturn(0); 1934 } 1935 1936 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1937 { 1938 PetscErrorCode ierr; 1939 PetscInt ncol; 1940 PetscBool colflag,allcolumns=PETSC_FALSE; 1941 1942 PetscFunctionBegin; 1943 /* Allocate memory to hold all the submatrices */ 1944 if (scall == MAT_INITIAL_MATRIX) { 1945 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1946 } 1947 1948 /* Check for special case: each processor gets entire matrix columns */ 1949 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1950 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1951 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1952 1953 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1954 PetscFunctionReturn(0); 1955 } 1956 1957 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1958 { 1959 PetscErrorCode ierr; 1960 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1961 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1962 Mat_SeqAIJ *subc; 1963 Mat_SubSppt *smat; 1964 1965 PetscFunctionBegin; 1966 /* Check for special case: each processor has a single IS */ 1967 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1968 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1969 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */ 1970 PetscFunctionReturn(0); 1971 } 1972 1973 /* Collect global wantallmatrix and nstages */ 1974 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1975 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1976 if (!nmax) nmax = 1; 1977 1978 if (scall == MAT_INITIAL_MATRIX) { 1979 /* Collect global wantallmatrix and nstages */ 1980 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1981 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1982 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1983 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1984 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1985 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1986 wantallmatrix = PETSC_TRUE; 1987 1988 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1989 } 1990 } 1991 1992 /* Determine the number of stages through which submatrices are done 1993 Each stage will extract nmax submatrices. 1994 nmax is determined by the matrix column dimension. 1995 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1996 */ 1997 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 1998 1999 in[0] = -1*(PetscInt)wantallmatrix; 2000 in[1] = nstages; 2001 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2002 wantallmatrix = (PetscBool)(-out[0]); 2003 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2004 2005 } else { /* MAT_REUSE_MATRIX */ 2006 if (ismax) { 2007 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2008 smat = subc->submatis1; 2009 } else { /* (*submat)[0] is a dummy matrix */ 2010 smat = (Mat_SubSppt*)(*submat)[0]->data; 2011 } 2012 if (!smat) { 2013 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2014 wantallmatrix = PETSC_TRUE; 2015 } else if (smat->singleis) { 2016 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2017 PetscFunctionReturn(0); 2018 } else { 2019 nstages = smat->nstages; 2020 } 2021 } 2022 2023 if (wantallmatrix) { 2024 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2025 PetscFunctionReturn(0); 2026 } 2027 2028 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2029 if (scall == MAT_INITIAL_MATRIX) { 2030 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2031 } 2032 2033 for (i=0,pos=0; i<nstages; i++) { 2034 if (pos+nmax <= ismax) max_no = nmax; 2035 else if (pos == ismax) max_no = 0; 2036 else max_no = ismax-pos; 2037 2038 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2039 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2040 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2041 smat->nstages = nstages; 2042 } 2043 pos += max_no; 2044 } 2045 2046 if (ismax && scall == MAT_INITIAL_MATRIX) { 2047 /* save nstages for reuse */ 2048 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2049 smat = subc->submatis1; 2050 smat->nstages = nstages; 2051 } 2052 PetscFunctionReturn(0); 2053 } 2054 2055 /* -------------------------------------------------------------------------*/ 2056 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2057 { 2058 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2059 Mat A = c->A; 2060 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2061 const PetscInt **icol,**irow; 2062 PetscInt *nrow,*ncol,start; 2063 PetscErrorCode ierr; 2064 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2065 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2066 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2067 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2068 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2069 #if defined(PETSC_USE_CTABLE) 2070 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2071 #else 2072 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2073 #endif 2074 const PetscInt *irow_i; 2075 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2076 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2077 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2078 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2079 MPI_Status *r_status3,*r_status4,*s_status4; 2080 MPI_Comm comm; 2081 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2082 PetscMPIInt *onodes1,*olengths1,end; 2083 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2084 Mat_SubSppt *smat_i; 2085 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2086 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2087 2088 PetscFunctionBegin; 2089 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2090 size = c->size; 2091 rank = c->rank; 2092 2093 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2094 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2095 2096 for (i=0; i<ismax; i++) { 2097 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2098 if (!issorted[i]) iscsorted = issorted[i]; 2099 2100 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2101 2102 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2103 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2104 2105 /* Check for special case: allcolumn */ 2106 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2107 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2108 if (colflag && ncol[i] == C->cmap->N) { 2109 allcolumns[i] = PETSC_TRUE; 2110 icol[i] = NULL; 2111 } else { 2112 allcolumns[i] = PETSC_FALSE; 2113 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2114 } 2115 } 2116 2117 if (scall == MAT_REUSE_MATRIX) { 2118 /* Assumes new rows are same length as the old rows */ 2119 for (i=0; i<ismax; i++) { 2120 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2121 subc = (Mat_SeqAIJ*)submats[i]->data; 2122 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2123 2124 /* Initial matrix as if empty */ 2125 ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2126 2127 smat_i = subc->submatis1; 2128 2129 nrqs = smat_i->nrqs; 2130 nrqr = smat_i->nrqr; 2131 rbuf1 = smat_i->rbuf1; 2132 rbuf2 = smat_i->rbuf2; 2133 rbuf3 = smat_i->rbuf3; 2134 req_source2 = smat_i->req_source2; 2135 2136 sbuf1 = smat_i->sbuf1; 2137 sbuf2 = smat_i->sbuf2; 2138 ptr = smat_i->ptr; 2139 tmp = smat_i->tmp; 2140 ctr = smat_i->ctr; 2141 2142 pa = smat_i->pa; 2143 req_size = smat_i->req_size; 2144 req_source1 = smat_i->req_source1; 2145 2146 allcolumns[i] = smat_i->allcolumns; 2147 row2proc[i] = smat_i->row2proc; 2148 rmap[i] = smat_i->rmap; 2149 cmap[i] = smat_i->cmap; 2150 } 2151 2152 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2153 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2154 smat_i = (Mat_SubSppt*)submats[0]->data; 2155 2156 nrqs = smat_i->nrqs; 2157 nrqr = smat_i->nrqr; 2158 rbuf1 = smat_i->rbuf1; 2159 rbuf2 = smat_i->rbuf2; 2160 rbuf3 = smat_i->rbuf3; 2161 req_source2 = smat_i->req_source2; 2162 2163 sbuf1 = smat_i->sbuf1; 2164 sbuf2 = smat_i->sbuf2; 2165 ptr = smat_i->ptr; 2166 tmp = smat_i->tmp; 2167 ctr = smat_i->ctr; 2168 2169 pa = smat_i->pa; 2170 req_size = smat_i->req_size; 2171 req_source1 = smat_i->req_source1; 2172 2173 allcolumns[0] = PETSC_FALSE; 2174 } 2175 } else { /* scall == MAT_INITIAL_MATRIX */ 2176 /* Get some new tags to keep the communication clean */ 2177 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2178 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2179 2180 /* evaluate communication - mesg to who, length of mesg, and buffer space 2181 required. Based on this, buffers are allocated, and data copied into them*/ 2182 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2183 2184 for (i=0; i<ismax; i++) { 2185 jmax = nrow[i]; 2186 irow_i = irow[i]; 2187 2188 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2189 row2proc[i] = row2proc_i; 2190 2191 if (issorted[i]) proc = 0; 2192 for (j=0; j<jmax; j++) { 2193 if (!issorted[i]) proc = 0; 2194 row = irow_i[j]; 2195 while (row >= C->rmap->range[proc+1]) proc++; 2196 w4[proc]++; 2197 row2proc_i[j] = proc; /* map row index to proc */ 2198 } 2199 for (j=0; j<size; j++) { 2200 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2201 } 2202 } 2203 2204 nrqs = 0; /* no of outgoing messages */ 2205 msz = 0; /* total mesg length (for all procs) */ 2206 w1[rank] = 0; /* no mesg sent to self */ 2207 w3[rank] = 0; 2208 for (i=0; i<size; i++) { 2209 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2210 } 2211 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2212 for (i=0,j=0; i<size; i++) { 2213 if (w1[i]) { pa[j] = i; j++; } 2214 } 2215 2216 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2217 for (i=0; i<nrqs; i++) { 2218 j = pa[i]; 2219 w1[j] += w2[j] + 2* w3[j]; 2220 msz += w1[j]; 2221 } 2222 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2223 2224 /* Determine the number of messages to expect, their lengths, from from-ids */ 2225 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2226 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2227 2228 /* Now post the Irecvs corresponding to these messages */ 2229 tag0 = ((PetscObject)C)->tag; 2230 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2231 2232 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2233 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2234 2235 /* Allocate Memory for outgoing messages */ 2236 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2237 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 2238 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 2239 2240 { 2241 PetscInt *iptr = tmp; 2242 k = 0; 2243 for (i=0; i<nrqs; i++) { 2244 j = pa[i]; 2245 iptr += k; 2246 sbuf1[j] = iptr; 2247 k = w1[j]; 2248 } 2249 } 2250 2251 /* Form the outgoing messages. Initialize the header space */ 2252 for (i=0; i<nrqs; i++) { 2253 j = pa[i]; 2254 sbuf1[j][0] = 0; 2255 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 2256 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2257 } 2258 2259 /* Parse the isrow and copy data into outbuf */ 2260 for (i=0; i<ismax; i++) { 2261 row2proc_i = row2proc[i]; 2262 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 2263 irow_i = irow[i]; 2264 jmax = nrow[i]; 2265 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2266 proc = row2proc_i[j]; 2267 if (proc != rank) { /* copy to the outgoing buf*/ 2268 ctr[proc]++; 2269 *ptr[proc] = irow_i[j]; 2270 ptr[proc]++; 2271 } 2272 } 2273 /* Update the headers for the current IS */ 2274 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2275 if ((ctr_j = ctr[j])) { 2276 sbuf1_j = sbuf1[j]; 2277 k = ++sbuf1_j[0]; 2278 sbuf1_j[2*k] = ctr_j; 2279 sbuf1_j[2*k-1] = i; 2280 } 2281 } 2282 } 2283 2284 /* Now post the sends */ 2285 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2286 for (i=0; i<nrqs; ++i) { 2287 j = pa[i]; 2288 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 2289 } 2290 2291 /* Post Receives to capture the buffer size */ 2292 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2293 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2294 rbuf2[0] = tmp + msz; 2295 for (i=1; i<nrqs; ++i) { 2296 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2297 } 2298 for (i=0; i<nrqs; ++i) { 2299 j = pa[i]; 2300 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr); 2301 } 2302 2303 /* Send to other procs the buf size they should allocate */ 2304 /* Receive messages*/ 2305 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2306 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2307 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2308 { 2309 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2310 PetscInt *sbuf2_i; 2311 2312 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 2313 for (i=0; i<nrqr; ++i) { 2314 req_size[i] = 0; 2315 rbuf1_i = rbuf1[i]; 2316 start = 2*rbuf1_i[0] + 1; 2317 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2318 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2319 sbuf2_i = sbuf2[i]; 2320 for (j=start; j<end; j++) { 2321 id = rbuf1_i[j] - rstart; 2322 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2323 sbuf2_i[j] = ncols; 2324 req_size[i] += ncols; 2325 } 2326 req_source1[i] = r_status1[i].MPI_SOURCE; 2327 /* form the header */ 2328 sbuf2_i[0] = req_size[i]; 2329 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2330 2331 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 2332 } 2333 } 2334 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2335 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2336 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2337 2338 /* Receive messages*/ 2339 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2340 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2341 2342 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 2343 for (i=0; i<nrqs; ++i) { 2344 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2345 req_source2[i] = r_status2[i].MPI_SOURCE; 2346 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 2347 } 2348 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2349 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2350 2351 /* Wait on sends1 and sends2 */ 2352 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2353 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2354 2355 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 2356 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 2357 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2358 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2359 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2360 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2361 2362 /* Now allocate sending buffers for a->j, and send them off */ 2363 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2364 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2365 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2366 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2367 2368 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2369 { 2370 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2371 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2372 PetscInt cend = C->cmap->rend; 2373 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2374 2375 for (i=0; i<nrqr; i++) { 2376 rbuf1_i = rbuf1[i]; 2377 sbuf_aj_i = sbuf_aj[i]; 2378 ct1 = 2*rbuf1_i[0] + 1; 2379 ct2 = 0; 2380 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2381 kmax = rbuf1[i][2*j]; 2382 for (k=0; k<kmax; k++,ct1++) { 2383 row = rbuf1_i[ct1] - rstart; 2384 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2385 ncols = nzA + nzB; 2386 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 2387 2388 /* load the column indices for this row into cols */ 2389 cols = sbuf_aj_i + ct2; 2390 2391 lwrite = 0; 2392 for (l=0; l<nzB; l++) { 2393 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2394 } 2395 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2396 for (l=0; l<nzB; l++) { 2397 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2398 } 2399 2400 ct2 += ncols; 2401 } 2402 } 2403 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 2404 } 2405 } 2406 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2407 2408 /* create col map: global col of C -> local col of submatrices */ 2409 { 2410 const PetscInt *icol_i; 2411 #if defined(PETSC_USE_CTABLE) 2412 for (i=0; i<ismax; i++) { 2413 if (!allcolumns[i]) { 2414 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2415 2416 jmax = ncol[i]; 2417 icol_i = icol[i]; 2418 cmap_i = cmap[i]; 2419 for (j=0; j<jmax; j++) { 2420 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2421 } 2422 } else cmap[i] = NULL; 2423 } 2424 #else 2425 for (i=0; i<ismax; i++) { 2426 if (!allcolumns[i]) { 2427 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2428 jmax = ncol[i]; 2429 icol_i = icol[i]; 2430 cmap_i = cmap[i]; 2431 for (j=0; j<jmax; j++) { 2432 cmap_i[icol_i[j]] = j+1; 2433 } 2434 } else cmap[i] = NULL; 2435 } 2436 #endif 2437 } 2438 2439 /* Create lens which is required for MatCreate... */ 2440 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2441 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2442 2443 if (ismax) { 2444 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2445 } 2446 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2447 2448 /* Update lens from local data */ 2449 for (i=0; i<ismax; i++) { 2450 row2proc_i = row2proc[i]; 2451 jmax = nrow[i]; 2452 if (!allcolumns[i]) cmap_i = cmap[i]; 2453 irow_i = irow[i]; 2454 lens_i = lens[i]; 2455 for (j=0; j<jmax; j++) { 2456 row = irow_i[j]; 2457 proc = row2proc_i[j]; 2458 if (proc == rank) { 2459 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2460 if (!allcolumns[i]) { 2461 for (k=0; k<ncols; k++) { 2462 #if defined(PETSC_USE_CTABLE) 2463 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2464 #else 2465 tcol = cmap_i[cols[k]]; 2466 #endif 2467 if (tcol) lens_i[j]++; 2468 } 2469 } else { /* allcolumns */ 2470 lens_i[j] = ncols; 2471 } 2472 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 2473 } 2474 } 2475 } 2476 2477 /* Create row map: global row of C -> local row of submatrices */ 2478 #if defined(PETSC_USE_CTABLE) 2479 for (i=0; i<ismax; i++) { 2480 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2481 irow_i = irow[i]; 2482 jmax = nrow[i]; 2483 for (j=0; j<jmax; j++) { 2484 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2485 } 2486 } 2487 #else 2488 for (i=0; i<ismax; i++) { 2489 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2490 rmap_i = rmap[i]; 2491 irow_i = irow[i]; 2492 jmax = nrow[i]; 2493 for (j=0; j<jmax; j++) { 2494 rmap_i[irow_i[j]] = j; 2495 } 2496 } 2497 #endif 2498 2499 /* Update lens from offproc data */ 2500 { 2501 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2502 2503 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2504 for (tmp2=0; tmp2<nrqs; tmp2++) { 2505 sbuf1_i = sbuf1[pa[tmp2]]; 2506 jmax = sbuf1_i[0]; 2507 ct1 = 2*jmax+1; 2508 ct2 = 0; 2509 rbuf2_i = rbuf2[tmp2]; 2510 rbuf3_i = rbuf3[tmp2]; 2511 for (j=1; j<=jmax; j++) { 2512 is_no = sbuf1_i[2*j-1]; 2513 max1 = sbuf1_i[2*j]; 2514 lens_i = lens[is_no]; 2515 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2516 rmap_i = rmap[is_no]; 2517 for (k=0; k<max1; k++,ct1++) { 2518 #if defined(PETSC_USE_CTABLE) 2519 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2520 row--; 2521 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2522 #else 2523 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2524 #endif 2525 max2 = rbuf2_i[ct1]; 2526 for (l=0; l<max2; l++,ct2++) { 2527 if (!allcolumns[is_no]) { 2528 #if defined(PETSC_USE_CTABLE) 2529 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2530 #else 2531 tcol = cmap_i[rbuf3_i[ct2]]; 2532 #endif 2533 if (tcol) lens_i[row]++; 2534 } else { /* allcolumns */ 2535 lens_i[row]++; /* lens_i[row] += max2 ? */ 2536 } 2537 } 2538 } 2539 } 2540 } 2541 } 2542 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2543 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 2544 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2545 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2546 2547 /* Create the submatrices */ 2548 for (i=0; i<ismax; i++) { 2549 PetscInt rbs,cbs; 2550 2551 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2552 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2553 2554 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2555 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2556 2557 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2558 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2559 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2560 2561 /* create struct Mat_SubSppt and attached it to submat */ 2562 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2563 subc = (Mat_SeqAIJ*)submats[i]->data; 2564 subc->submatis1 = smat_i; 2565 2566 smat_i->destroy = submats[i]->ops->destroy; 2567 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2568 submats[i]->factortype = C->factortype; 2569 2570 smat_i->id = i; 2571 smat_i->nrqs = nrqs; 2572 smat_i->nrqr = nrqr; 2573 smat_i->rbuf1 = rbuf1; 2574 smat_i->rbuf2 = rbuf2; 2575 smat_i->rbuf3 = rbuf3; 2576 smat_i->sbuf2 = sbuf2; 2577 smat_i->req_source2 = req_source2; 2578 2579 smat_i->sbuf1 = sbuf1; 2580 smat_i->ptr = ptr; 2581 smat_i->tmp = tmp; 2582 smat_i->ctr = ctr; 2583 2584 smat_i->pa = pa; 2585 smat_i->req_size = req_size; 2586 smat_i->req_source1 = req_source1; 2587 2588 smat_i->allcolumns = allcolumns[i]; 2589 smat_i->singleis = PETSC_FALSE; 2590 smat_i->row2proc = row2proc[i]; 2591 smat_i->rmap = rmap[i]; 2592 smat_i->cmap = cmap[i]; 2593 } 2594 2595 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2596 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2597 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2598 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2599 2600 /* create struct Mat_SubSppt and attached it to submat */ 2601 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2602 submats[0]->data = (void*)smat_i; 2603 2604 smat_i->destroy = submats[0]->ops->destroy; 2605 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2606 submats[0]->factortype = C->factortype; 2607 2608 smat_i->id = 0; 2609 smat_i->nrqs = nrqs; 2610 smat_i->nrqr = nrqr; 2611 smat_i->rbuf1 = rbuf1; 2612 smat_i->rbuf2 = rbuf2; 2613 smat_i->rbuf3 = rbuf3; 2614 smat_i->sbuf2 = sbuf2; 2615 smat_i->req_source2 = req_source2; 2616 2617 smat_i->sbuf1 = sbuf1; 2618 smat_i->ptr = ptr; 2619 smat_i->tmp = tmp; 2620 smat_i->ctr = ctr; 2621 2622 smat_i->pa = pa; 2623 smat_i->req_size = req_size; 2624 smat_i->req_source1 = req_source1; 2625 2626 smat_i->allcolumns = PETSC_FALSE; 2627 smat_i->singleis = PETSC_FALSE; 2628 smat_i->row2proc = NULL; 2629 smat_i->rmap = NULL; 2630 smat_i->cmap = NULL; 2631 } 2632 2633 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2634 ierr = PetscFree(lens);CHKERRQ(ierr); 2635 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2636 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2637 2638 } /* endof scall == MAT_INITIAL_MATRIX */ 2639 2640 /* Post recv matrix values */ 2641 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2642 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2643 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2644 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2645 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2646 for (i=0; i<nrqs; ++i) { 2647 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2648 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 2649 } 2650 2651 /* Allocate sending buffers for a->a, and send them off */ 2652 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2653 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2654 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2655 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2656 2657 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2658 { 2659 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2660 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2661 PetscInt cend = C->cmap->rend; 2662 PetscInt *b_j = b->j; 2663 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 2664 2665 for (i=0; i<nrqr; i++) { 2666 rbuf1_i = rbuf1[i]; 2667 sbuf_aa_i = sbuf_aa[i]; 2668 ct1 = 2*rbuf1_i[0]+1; 2669 ct2 = 0; 2670 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2671 kmax = rbuf1_i[2*j]; 2672 for (k=0; k<kmax; k++,ct1++) { 2673 row = rbuf1_i[ct1] - rstart; 2674 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 2675 ncols = nzA + nzB; 2676 cworkB = b_j + b_i[row]; 2677 vworkA = a_a + a_i[row]; 2678 vworkB = b_a + b_i[row]; 2679 2680 /* load the column values for this row into vals*/ 2681 vals = sbuf_aa_i+ct2; 2682 2683 lwrite = 0; 2684 for (l=0; l<nzB; l++) { 2685 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2686 } 2687 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2688 for (l=0; l<nzB; l++) { 2689 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2690 } 2691 2692 ct2 += ncols; 2693 } 2694 } 2695 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 2696 } 2697 } 2698 2699 /* Assemble the matrices */ 2700 /* First assemble the local rows */ 2701 for (i=0; i<ismax; i++) { 2702 row2proc_i = row2proc[i]; 2703 subc = (Mat_SeqAIJ*)submats[i]->data; 2704 imat_ilen = subc->ilen; 2705 imat_j = subc->j; 2706 imat_i = subc->i; 2707 imat_a = subc->a; 2708 2709 if (!allcolumns[i]) cmap_i = cmap[i]; 2710 rmap_i = rmap[i]; 2711 irow_i = irow[i]; 2712 jmax = nrow[i]; 2713 for (j=0; j<jmax; j++) { 2714 row = irow_i[j]; 2715 proc = row2proc_i[j]; 2716 if (proc == rank) { 2717 old_row = row; 2718 #if defined(PETSC_USE_CTABLE) 2719 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2720 row--; 2721 #else 2722 row = rmap_i[row]; 2723 #endif 2724 ilen_row = imat_ilen[row]; 2725 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2726 mat_i = imat_i[row]; 2727 mat_a = imat_a + mat_i; 2728 mat_j = imat_j + mat_i; 2729 if (!allcolumns[i]) { 2730 for (k=0; k<ncols; k++) { 2731 #if defined(PETSC_USE_CTABLE) 2732 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2733 #else 2734 tcol = cmap_i[cols[k]]; 2735 #endif 2736 if (tcol) { 2737 *mat_j++ = tcol - 1; 2738 *mat_a++ = vals[k]; 2739 ilen_row++; 2740 } 2741 } 2742 } else { /* allcolumns */ 2743 for (k=0; k<ncols; k++) { 2744 *mat_j++ = cols[k]; /* global col index! */ 2745 *mat_a++ = vals[k]; 2746 ilen_row++; 2747 } 2748 } 2749 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2750 2751 imat_ilen[row] = ilen_row; 2752 } 2753 } 2754 } 2755 2756 /* Now assemble the off proc rows */ 2757 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2758 for (tmp2=0; tmp2<nrqs; tmp2++) { 2759 sbuf1_i = sbuf1[pa[tmp2]]; 2760 jmax = sbuf1_i[0]; 2761 ct1 = 2*jmax + 1; 2762 ct2 = 0; 2763 rbuf2_i = rbuf2[tmp2]; 2764 rbuf3_i = rbuf3[tmp2]; 2765 rbuf4_i = rbuf4[tmp2]; 2766 for (j=1; j<=jmax; j++) { 2767 is_no = sbuf1_i[2*j-1]; 2768 rmap_i = rmap[is_no]; 2769 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2770 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2771 imat_ilen = subc->ilen; 2772 imat_j = subc->j; 2773 imat_i = subc->i; 2774 imat_a = subc->a; 2775 max1 = sbuf1_i[2*j]; 2776 for (k=0; k<max1; k++,ct1++) { 2777 row = sbuf1_i[ct1]; 2778 #if defined(PETSC_USE_CTABLE) 2779 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2780 row--; 2781 #else 2782 row = rmap_i[row]; 2783 #endif 2784 ilen = imat_ilen[row]; 2785 mat_i = imat_i[row]; 2786 mat_a = imat_a + mat_i; 2787 mat_j = imat_j + mat_i; 2788 max2 = rbuf2_i[ct1]; 2789 if (!allcolumns[is_no]) { 2790 for (l=0; l<max2; l++,ct2++) { 2791 #if defined(PETSC_USE_CTABLE) 2792 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2793 #else 2794 tcol = cmap_i[rbuf3_i[ct2]]; 2795 #endif 2796 if (tcol) { 2797 *mat_j++ = tcol - 1; 2798 *mat_a++ = rbuf4_i[ct2]; 2799 ilen++; 2800 } 2801 } 2802 } else { /* allcolumns */ 2803 for (l=0; l<max2; l++,ct2++) { 2804 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2805 *mat_a++ = rbuf4_i[ct2]; 2806 ilen++; 2807 } 2808 } 2809 imat_ilen[row] = ilen; 2810 } 2811 } 2812 } 2813 2814 if (!iscsorted) { /* sort column indices of the rows */ 2815 for (i=0; i<ismax; i++) { 2816 subc = (Mat_SeqAIJ*)submats[i]->data; 2817 imat_j = subc->j; 2818 imat_i = subc->i; 2819 imat_a = subc->a; 2820 imat_ilen = subc->ilen; 2821 2822 if (allcolumns[i]) continue; 2823 jmax = nrow[i]; 2824 for (j=0; j<jmax; j++) { 2825 mat_i = imat_i[j]; 2826 mat_a = imat_a + mat_i; 2827 mat_j = imat_j + mat_i; 2828 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2829 } 2830 } 2831 } 2832 2833 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2834 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2835 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 2836 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2837 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2838 2839 /* Restore the indices */ 2840 for (i=0; i<ismax; i++) { 2841 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2842 if (!allcolumns[i]) { 2843 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2844 } 2845 } 2846 2847 for (i=0; i<ismax; i++) { 2848 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2849 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2850 } 2851 2852 /* Destroy allocated memory */ 2853 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2854 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2855 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2856 2857 for (i=0; i<nrqs; ++i) { 2858 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2859 } 2860 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2861 2862 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2863 PetscFunctionReturn(0); 2864 } 2865 2866 /* 2867 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2868 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2869 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2870 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2871 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2872 state, and needs to be "assembled" later by compressing B's column space. 2873 2874 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2875 Following this call, C->A & C->B have been created, even if empty. 2876 */ 2877 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2878 { 2879 /* If making this function public, change the error returned in this function away from _PLIB. */ 2880 PetscErrorCode ierr; 2881 Mat_MPIAIJ *aij; 2882 Mat_SeqAIJ *Baij; 2883 PetscBool seqaij,Bdisassembled; 2884 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2885 PetscScalar v; 2886 const PetscInt *rowindices,*colindices; 2887 2888 PetscFunctionBegin; 2889 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2890 if (A) { 2891 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2892 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2893 if (rowemb) { 2894 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2895 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2896 } else { 2897 if (C->rmap->n != A->rmap->n) { 2898 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2899 } 2900 } 2901 if (dcolemb) { 2902 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2903 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2904 } else { 2905 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2906 } 2907 } 2908 if (B) { 2909 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2910 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2911 if (rowemb) { 2912 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2913 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2914 } else { 2915 if (C->rmap->n != B->rmap->n) { 2916 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2917 } 2918 } 2919 if (ocolemb) { 2920 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2921 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2922 } else { 2923 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2924 } 2925 } 2926 2927 aij = (Mat_MPIAIJ*)C->data; 2928 if (!aij->A) { 2929 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2930 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2931 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2932 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2933 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2934 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2935 } 2936 if (A) { 2937 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2938 } else { 2939 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2940 } 2941 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2942 /* 2943 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2944 need to "disassemble" B -- convert it to using C's global indices. 2945 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2946 2947 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2948 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2949 2950 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2951 At least avoid calling MatSetValues() and the implied searches? 2952 */ 2953 2954 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2955 #if defined(PETSC_USE_CTABLE) 2956 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2957 #else 2958 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2959 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2960 if (aij->B) { 2961 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2962 } 2963 #endif 2964 ngcol = 0; 2965 if (aij->lvec) { 2966 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2967 } 2968 if (aij->garray) { 2969 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2970 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2971 } 2972 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2973 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2974 } 2975 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2976 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2977 } 2978 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2979 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2980 } 2981 } 2982 Bdisassembled = PETSC_FALSE; 2983 if (!aij->B) { 2984 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2985 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2986 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2987 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2988 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2989 Bdisassembled = PETSC_TRUE; 2990 } 2991 if (B) { 2992 Baij = (Mat_SeqAIJ*)B->data; 2993 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2994 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2995 for (i=0; i<B->rmap->n; i++) { 2996 nz[i] = Baij->i[i+1] - Baij->i[i]; 2997 } 2998 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 2999 ierr = PetscFree(nz);CHKERRQ(ierr); 3000 } 3001 3002 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3003 shift = rend-rstart; 3004 count = 0; 3005 rowindices = NULL; 3006 colindices = NULL; 3007 if (rowemb) { 3008 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3009 } 3010 if (ocolemb) { 3011 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3012 } 3013 for (i=0; i<B->rmap->n; i++) { 3014 PetscInt row; 3015 row = i; 3016 if (rowindices) row = rowindices[i]; 3017 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3018 col = Baij->j[count]; 3019 if (colindices) col = colindices[col]; 3020 if (Bdisassembled && col>=rstart) col += shift; 3021 v = Baij->a[count]; 3022 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3023 ++count; 3024 } 3025 } 3026 /* No assembly for aij->B is necessary. */ 3027 /* FIXME: set aij->B's nonzerostate correctly. */ 3028 } else { 3029 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3030 } 3031 C->preallocated = PETSC_TRUE; 3032 C->was_assembled = PETSC_FALSE; 3033 C->assembled = PETSC_FALSE; 3034 /* 3035 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3036 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3037 */ 3038 PetscFunctionReturn(0); 3039 } 3040 3041 /* 3042 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3043 */ 3044 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3045 { 3046 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3047 3048 PetscFunctionBegin; 3049 PetscValidPointer(A,2); 3050 PetscValidPointer(B,3); 3051 /* FIXME: make sure C is assembled */ 3052 *A = aij->A; 3053 *B = aij->B; 3054 /* Note that we don't incref *A and *B, so be careful! */ 3055 PetscFunctionReturn(0); 3056 } 3057 3058 /* 3059 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3060 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3061 */ 3062 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3063 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3064 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3065 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3066 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3067 { 3068 PetscErrorCode ierr; 3069 PetscMPIInt isize,flag; 3070 PetscInt i,ii,cismax,ispar; 3071 Mat *A,*B; 3072 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3073 3074 PetscFunctionBegin; 3075 if (!ismax) PetscFunctionReturn(0); 3076 3077 for (i = 0, cismax = 0; i < ismax; ++i) { 3078 PetscMPIInt isize; 3079 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3080 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3081 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3082 if (isize > 1) ++cismax; 3083 } 3084 3085 /* 3086 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3087 ispar counts the number of parallel ISs across C's comm. 3088 */ 3089 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3090 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3091 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3092 PetscFunctionReturn(0); 3093 } 3094 3095 /* if (ispar) */ 3096 /* 3097 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3098 These are used to extract the off-diag portion of the resulting parallel matrix. 3099 The row IS for the off-diag portion is the same as for the diag portion, 3100 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3101 */ 3102 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3103 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3104 for (i = 0, ii = 0; i < ismax; ++i) { 3105 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3106 if (isize > 1) { 3107 /* 3108 TODO: This is the part that's ***NOT SCALABLE***. 3109 To fix this we need to extract just the indices of C's nonzero columns 3110 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3111 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3112 to be done without serializing on the IS list, so, most likely, it is best 3113 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3114 */ 3115 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3116 /* Now we have to 3117 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3118 were sorted on each rank, concatenated they might no longer be sorted; 3119 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3120 indices in the nondecreasing order to the original index positions. 3121 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3122 */ 3123 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3124 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3125 ++ii; 3126 } 3127 } 3128 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3129 for (i = 0, ii = 0; i < ismax; ++i) { 3130 PetscInt j,issize; 3131 const PetscInt *indices; 3132 3133 /* 3134 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3135 */ 3136 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3137 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3138 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3139 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3140 for (j = 1; j < issize; ++j) { 3141 if (indices[j] == indices[j-1]) { 3142 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3143 } 3144 } 3145 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3146 3147 3148 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3149 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3150 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3151 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3152 for (j = 1; j < issize; ++j) { 3153 if (indices[j-1] == indices[j]) { 3154 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3155 } 3156 } 3157 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3158 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3159 if (isize > 1) { 3160 cisrow[ii] = isrow[i]; 3161 ++ii; 3162 } 3163 } 3164 /* 3165 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3166 array of sequential matrices underlying the resulting parallel matrices. 3167 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3168 contain duplicates. 3169 3170 There are as many diag matrices as there are original index sets. There are only as many parallel 3171 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3172 3173 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3174 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3175 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3176 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3177 with A[i] and B[ii] extracted from the corresponding MPI submat. 3178 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3179 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3180 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3181 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3182 values into A[i] and B[ii] sitting inside the corresponding submat. 3183 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3184 A[i], B[ii], AA[i] or BB[ii] matrices. 3185 */ 3186 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3187 if (scall == MAT_INITIAL_MATRIX) { 3188 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3189 } 3190 3191 /* Now obtain the sequential A and B submatrices separately. */ 3192 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3193 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3194 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3195 3196 /* 3197 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3198 matrices A & B have been extracted directly into the parallel matrices containing them, or 3199 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3200 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3201 to have the same sparsity pattern. 3202 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3203 must be constructed for C. This is done by setseqmat(s). 3204 */ 3205 for (i = 0, ii = 0; i < ismax; ++i) { 3206 /* 3207 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3208 That way we can avoid sorting and computing permutations when reusing. 3209 To this end: 3210 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3211 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3212 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3213 */ 3214 MatStructure pattern; 3215 pattern = DIFFERENT_NONZERO_PATTERN; 3216 3217 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3218 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3219 if (isize > 1) { 3220 if (scall == MAT_INITIAL_MATRIX) { 3221 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3222 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3223 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3224 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3225 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3226 } 3227 /* 3228 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3229 */ 3230 { 3231 Mat AA,BB; 3232 AA = A[i]; 3233 BB = B[ii]; 3234 if (AA || BB) { 3235 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3236 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3237 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3238 } 3239 3240 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3241 } 3242 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3243 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3244 ++ii; 3245 } else { /* if (isize == 1) */ 3246 if (scall == MAT_REUSE_MATRIX) { 3247 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3248 } 3249 if (isrow_p[i] || iscol_p[i]) { 3250 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3251 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3252 /* Otherwise A is extracted straight into (*submats)[i]. */ 3253 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3254 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3255 } else (*submat)[i] = A[i]; 3256 } 3257 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3258 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3259 } 3260 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3261 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3262 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3263 ierr = PetscFree(A);CHKERRQ(ierr); 3264 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3265 PetscFunctionReturn(0); 3266 } 3267 3268 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3269 { 3270 PetscErrorCode ierr; 3271 3272 PetscFunctionBegin; 3273 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3274 PetscFunctionReturn(0); 3275 } 3276