/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

/* forward declarations of the per-round workers used by the two driver routines below */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);


/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the index sets is[]
   by ov levels, one level per call to MatIncreaseOverlap_MPIAIJ_Once.

   Input Parameters:
+  C    - the MPIAIJ matrix
.  imax - number of index sets
.  is   - the index sets (replaced in place with the enlarged sets)
-  ov   - number of overlap levels requested; must be non-negative
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but each round uses the memory-scalable (PetscSF based) implementation.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}


/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One round of overlap growth.

   Rows of is[] owned by other ranks are shipped to their owners with a
   PetscSF; the owners expand them (Send_Scalable) and ship the expansions
   back, which are merged into is[] by Receive_Scalable.  Rows owned locally
   are expanded in Local_Scalable, overlapped with the communication.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge the received overlap
   data (triples <is id, row count, rows> packed in recvdata, nrecvs entries)
   into the index sets is[], then sort and deduplicate each set and rebuild it.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ?
                lsize:max_lsize;
    isz[i] = lsize;
  }
  /* one slab of max_lsize+nrecvs entries per IS: existing indices plus, at worst, all received rows */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - Expand, on the owner side, the
   rows requested by other ranks (fromrows, packed as <is id, row> pairs per
   sending rank) into their full sparsity pattern, and pack the reply
   messages as triples <is id, number of rows, rows>.

   Output Parameters:
+  sbrowsizes - per-destination sizes/offsets of the reply (caller frees)
-  sbrows     - the packed reply data (caller frees)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: count the (deduplicated) expansion of each <rank,is> pair so
     the reply buffer can be allocated in one shot */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: same traversal as above, now writing the packed reply */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each index set by the
   sparsity pattern of its locally owned rows (diagonal block Amat plus
   off-diagonal block Bmat); rows owned by other ranks are skipped here and
   handled through the communication path.  Each is[i] is rebuilt sorted and
   without duplicates.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements
 corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one chunk of w1[j] entries per destination */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      /* table stores <row+1, position+1>; undo the +1 shifts when unpacking */
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr =
PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      /* table stores <row+1, position+1>; undo the +1 shifts when unpacking */
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* expand every current member row by its full local sparsity pattern;
       the bit table table_i filters out columns already present */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
     C    - the matrix
     nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count the total number of requested rows (ct) and row indices received
     (total_sz); each message is framed as
     [nIS, (is_id,len) pairs..., row indices...] */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic size for the single reply buffer: average nonzeros per local
     row times the number of requested rows; grown on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor: first row index in rbuf_i */
    ct2    =  ct1;          /* write cursor into xdata[i] */
    ct3   += ct1;           /* running total of words written over all replies */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;           /* remember where this IS's data starts */
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and re-base every xdata[] pointer,
               since they all alias the single allocation xdata[0] */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each stored row by the global columns of its nonzeros */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  /* NOTE(review): mem_estimate and ct3 count PetscInt entries, not bytes,
     despite the wording of the message below */
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /*
       ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* each process fills only its own row range; the allgather below
       completes the global lens[] array */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, merging the off-diagonal
       (B) and diagonal (A) blocks into globally increasing column order:
       below-diagonal B entries, then A entries, then above-diagonal B entries
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into B's j[] at my row offset */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j; /* walked in step with b_sendbuf so garray[*b_sendj] gives the global column */
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* on MAT_REUSE_MATRIX the INITIAL-branch allocation was skipped */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag ==
MAT_GET_VALUES) */ 1211 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1212 1213 if (A->symmetric) { 1214 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1215 } else if (A->hermitian) { 1216 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1217 } else if (A->structurally_symmetric) { 1218 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1219 } 1220 PetscFunctionReturn(0); 1221 } 1222 1223 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1224 { 1225 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1226 Mat submat,A = c->A,B = c->B; 1227 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1228 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1229 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1230 const PetscInt *icol,*irow; 1231 PetscInt nrow,ncol,start; 1232 PetscErrorCode ierr; 1233 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1234 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1235 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1236 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1237 PetscInt *lens,rmax,ncols,*cols,Crow; 1238 #if defined(PETSC_USE_CTABLE) 1239 PetscTable cmap,rmap; 1240 PetscInt *cmap_loc,*rmap_loc; 1241 #else 1242 PetscInt *cmap,*rmap; 1243 #endif 1244 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1245 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1246 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1247 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1248 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1249 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1250 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1251 MPI_Comm comm; 1252 PetscScalar 
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1253 PetscMPIInt *onodes1,*olengths1,idex,end; 1254 Mat_SubSppt *smatis1; 1255 PetscBool isrowsorted; 1256 1257 PetscFunctionBegin; 1258 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1259 1260 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1261 size = c->size; 1262 rank = c->rank; 1263 1264 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1265 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1266 1267 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1268 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1269 if (allcolumns) { 1270 icol = NULL; 1271 ncol = C->cmap->N; 1272 } else { 1273 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1274 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1275 } 1276 1277 if (scall == MAT_INITIAL_MATRIX) { 1278 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1279 1280 /* Get some new tags to keep the communication clean */ 1281 tag1 = ((PetscObject)C)->tag; 1282 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1283 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1284 1285 /* evaluate communication - mesg to who, length of mesg, and buffer space 1286 required. Based on this, buffers are allocated, and data copied into them */ 1287 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1288 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1289 1290 /* w1[proc] = num of rows owned by proc -- to be requested */ 1291 proc = 0; 1292 nrqs = 0; /* num of outgoing messages */ 1293 for (j=0; j<nrow; j++) { 1294 row = irow[j]; /* sorted! 
*/ 1295 while (row >= C->rmap->range[proc+1]) proc++; 1296 w1[proc]++; 1297 row2proc[j] = proc; /* map row index to proc */ 1298 1299 if (proc != rank && !w2[proc]) { 1300 w2[proc] = 1; nrqs++; 1301 } 1302 } 1303 w1[rank] = 0; /* rows owned by self will not be requested */ 1304 1305 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1306 for (proc=0,j=0; proc<size; proc++) { 1307 if (w1[proc]) { pa[j++] = proc;} 1308 } 1309 1310 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1311 msz = 0; /* total mesg length (for all procs) */ 1312 for (i=0; i<nrqs; i++) { 1313 proc = pa[i]; 1314 w1[proc] += 3; 1315 msz += w1[proc]; 1316 } 1317 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1318 1319 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1320 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1321 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1322 1323 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1324 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1325 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1326 1327 /* Now post the Irecvs corresponding to these messages */ 1328 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1329 1330 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1331 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1332 1333 /* Allocate Memory for outgoing messages */ 1334 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1335 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1336 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1337 1338 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1339 iptr = tmp; 1340 for (i=0; i<nrqs; i++) { 1341 proc = pa[i]; 1342 sbuf1[proc] = iptr; 
1343 iptr += w1[proc]; 1344 } 1345 1346 /* Form the outgoing messages */ 1347 /* Initialize the header space */ 1348 for (i=0; i<nrqs; i++) { 1349 proc = pa[i]; 1350 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1351 ptr[proc] = sbuf1[proc] + 3; 1352 } 1353 1354 /* Parse the isrow and copy data into outbuf */ 1355 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1356 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1357 proc = row2proc[j]; 1358 if (proc != rank) { /* copy to the outgoing buf*/ 1359 *ptr[proc] = irow[j]; 1360 ctr[proc]++; ptr[proc]++; 1361 } 1362 } 1363 1364 /* Update the headers for the current IS */ 1365 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1366 if ((ctr_j = ctr[j])) { 1367 sbuf1_j = sbuf1[j]; 1368 k = ++sbuf1_j[0]; 1369 sbuf1_j[2*k] = ctr_j; 1370 sbuf1_j[2*k-1] = 0; 1371 } 1372 } 1373 1374 /* Now post the sends */ 1375 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1376 for (i=0; i<nrqs; ++i) { 1377 proc = pa[i]; 1378 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1379 } 1380 1381 /* Post Receives to capture the buffer size */ 1382 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1383 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1384 1385 rbuf2[0] = tmp + msz; 1386 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1387 1388 for (i=0; i<nrqs; ++i) { 1389 proc = pa[i]; 1390 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1391 } 1392 1393 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1394 1395 /* Send to other procs the buf size they should allocate */ 1396 /* Receive messages*/ 1397 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1398 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1399 1400 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1401 for 
(i=0; i<nrqr; ++i) { 1402 req_size[i] = 0; 1403 rbuf1_i = rbuf1[i]; 1404 start = 2*rbuf1_i[0] + 1; 1405 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1406 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1407 sbuf2_i = sbuf2[i]; 1408 for (j=start; j<end; j++) { 1409 k = rbuf1_i[j] - rstart; 1410 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1411 sbuf2_i[j] = ncols; 1412 req_size[i] += ncols; 1413 } 1414 req_source1[i] = r_status1[i].MPI_SOURCE; 1415 1416 /* form the header */ 1417 sbuf2_i[0] = req_size[i]; 1418 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1419 1420 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1421 } 1422 1423 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1424 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1425 1426 /* rbuf2 is received, Post recv column indices a->j */ 1427 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1428 1429 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1430 for (i=0; i<nrqs; ++i) { 1431 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1432 req_source2[i] = r_status2[i].MPI_SOURCE; 1433 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1434 } 1435 1436 /* Wait on sends1 and sends2 */ 1437 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1438 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1439 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1440 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1441 1442 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1443 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1444 1445 /* Now allocate sending buffers for a->j, and send them off */ 1446 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1447 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1448 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1449 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + 
req_size[i-1]; 1450 1451 for (i=0; i<nrqr; i++) { /* for each requested message */ 1452 rbuf1_i = rbuf1[i]; 1453 sbuf_aj_i = sbuf_aj[i]; 1454 ct1 = 2*rbuf1_i[0] + 1; 1455 ct2 = 0; 1456 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1457 1458 kmax = rbuf1[i][2]; 1459 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1460 row = rbuf1_i[ct1] - rstart; 1461 nzA = ai[row+1] - ai[row]; 1462 nzB = bi[row+1] - bi[row]; 1463 ncols = nzA + nzB; 1464 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1465 1466 /* load the column indices for this row into cols*/ 1467 cols = sbuf_aj_i + ct2; 1468 1469 lwrite = 0; 1470 for (l=0; l<nzB; l++) { 1471 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1472 } 1473 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1474 for (l=0; l<nzB; l++) { 1475 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1476 } 1477 1478 ct2 += ncols; 1479 } 1480 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1481 } 1482 1483 /* create column map (cmap): global col of C -> local col of submat */ 1484 #if defined(PETSC_USE_CTABLE) 1485 if (!allcolumns) { 1486 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1487 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1488 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1489 if (icol[j] >= cstart && icol[j] <cend) { 1490 cmap_loc[icol[j] - cstart] = j+1; 1491 } else { /* use PetscTable for non-local col indices */ 1492 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1493 } 1494 } 1495 } else { 1496 cmap = NULL; 1497 cmap_loc = NULL; 1498 } 1499 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1500 #else 1501 if (!allcolumns) { 1502 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1503 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1504 } else { 1505 cmap = NULL; 1506 } 1507 #endif 1508 1509 /* Create lens for 
MatSeqAIJSetPreallocation() */ 1510 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1511 1512 /* Compute lens from local part of C */ 1513 for (j=0; j<nrow; j++) { 1514 row = irow[j]; 1515 proc = row2proc[j]; 1516 if (proc == rank) { 1517 /* diagonal part A = c->A */ 1518 ncols = ai[row-rstart+1] - ai[row-rstart]; 1519 cols = aj + ai[row-rstart]; 1520 if (!allcolumns) { 1521 for (k=0; k<ncols; k++) { 1522 #if defined(PETSC_USE_CTABLE) 1523 tcol = cmap_loc[cols[k]]; 1524 #else 1525 tcol = cmap[cols[k]+cstart]; 1526 #endif 1527 if (tcol) lens[j]++; 1528 } 1529 } else { /* allcolumns */ 1530 lens[j] = ncols; 1531 } 1532 1533 /* off-diagonal part B = c->B */ 1534 ncols = bi[row-rstart+1] - bi[row-rstart]; 1535 cols = bj + bi[row-rstart]; 1536 if (!allcolumns) { 1537 for (k=0; k<ncols; k++) { 1538 #if defined(PETSC_USE_CTABLE) 1539 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1540 #else 1541 tcol = cmap[bmap[cols[k]]]; 1542 #endif 1543 if (tcol) lens[j]++; 1544 } 1545 } else { /* allcolumns */ 1546 lens[j] += ncols; 1547 } 1548 } 1549 } 1550 1551 /* Create row map (rmap): global row of C -> local row of submat */ 1552 #if defined(PETSC_USE_CTABLE) 1553 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1554 for (j=0; j<nrow; j++) { 1555 row = irow[j]; 1556 proc = row2proc[j]; 1557 if (proc == rank) { /* a local row */ 1558 rmap_loc[row - rstart] = j; 1559 } else { 1560 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1561 } 1562 } 1563 #else 1564 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1565 for (j=0; j<nrow; j++) { 1566 rmap[irow[j]] = j; 1567 } 1568 #endif 1569 1570 /* Update lens from offproc data */ 1571 /* recv a->j is done */ 1572 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1573 for (i=0; i<nrqs; i++) { 1574 proc = pa[i]; 1575 sbuf1_i = sbuf1[proc]; 1576 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1577 ct1 = 2 + 1; 1578 ct2 = 0; 1579 
rbuf2_i = rbuf2[i]; /* received length of C->j */ 1580 rbuf3_i = rbuf3[i]; /* received C->j */ 1581 1582 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1583 max1 = sbuf1_i[2]; 1584 for (k=0; k<max1; k++,ct1++) { 1585 #if defined(PETSC_USE_CTABLE) 1586 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1587 row--; 1588 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1589 #else 1590 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1591 #endif 1592 /* Now, store row index of submat in sbuf1_i[ct1] */ 1593 sbuf1_i[ct1] = row; 1594 1595 nnz = rbuf2_i[ct1]; 1596 if (!allcolumns) { 1597 for (l=0; l<nnz; l++,ct2++) { 1598 #if defined(PETSC_USE_CTABLE) 1599 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1600 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1601 } else { 1602 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1603 } 1604 #else 1605 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1606 #endif 1607 if (tcol) lens[row]++; 1608 } 1609 } else { /* allcolumns */ 1610 lens[row] += nnz; 1611 } 1612 } 1613 } 1614 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1615 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1616 1617 /* Create the submatrices */ 1618 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1619 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1620 1621 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1622 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1623 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1624 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1625 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1626 1627 /* create struct Mat_SubSppt and attached it to submat */ 1628 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1629 subc = (Mat_SeqAIJ*)submat->data; 1630 subc->submatis1 = smatis1; 1631 1632 smatis1->id = 0; 
1633 smatis1->nrqs = nrqs; 1634 smatis1->nrqr = nrqr; 1635 smatis1->rbuf1 = rbuf1; 1636 smatis1->rbuf2 = rbuf2; 1637 smatis1->rbuf3 = rbuf3; 1638 smatis1->sbuf2 = sbuf2; 1639 smatis1->req_source2 = req_source2; 1640 1641 smatis1->sbuf1 = sbuf1; 1642 smatis1->ptr = ptr; 1643 smatis1->tmp = tmp; 1644 smatis1->ctr = ctr; 1645 1646 smatis1->pa = pa; 1647 smatis1->req_size = req_size; 1648 smatis1->req_source1 = req_source1; 1649 1650 smatis1->allcolumns = allcolumns; 1651 smatis1->singleis = PETSC_TRUE; 1652 smatis1->row2proc = row2proc; 1653 smatis1->rmap = rmap; 1654 smatis1->cmap = cmap; 1655 #if defined(PETSC_USE_CTABLE) 1656 smatis1->rmap_loc = rmap_loc; 1657 smatis1->cmap_loc = cmap_loc; 1658 #endif 1659 1660 smatis1->destroy = submat->ops->destroy; 1661 submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices; 1662 submat->factortype = C->factortype; 1663 1664 /* compute rmax */ 1665 rmax = 0; 1666 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1667 1668 } else { /* scall == MAT_REUSE_MATRIX */ 1669 submat = submats[0]; 1670 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1671 1672 subc = (Mat_SeqAIJ*)submat->data; 1673 rmax = subc->rmax; 1674 smatis1 = subc->submatis1; 1675 nrqs = smatis1->nrqs; 1676 nrqr = smatis1->nrqr; 1677 rbuf1 = smatis1->rbuf1; 1678 rbuf2 = smatis1->rbuf2; 1679 rbuf3 = smatis1->rbuf3; 1680 req_source2 = smatis1->req_source2; 1681 1682 sbuf1 = smatis1->sbuf1; 1683 sbuf2 = smatis1->sbuf2; 1684 ptr = smatis1->ptr; 1685 tmp = smatis1->tmp; 1686 ctr = smatis1->ctr; 1687 1688 pa = smatis1->pa; 1689 req_size = smatis1->req_size; 1690 req_source1 = smatis1->req_source1; 1691 1692 allcolumns = smatis1->allcolumns; 1693 row2proc = smatis1->row2proc; 1694 rmap = smatis1->rmap; 1695 cmap = smatis1->cmap; 1696 #if defined(PETSC_USE_CTABLE) 1697 rmap_loc = smatis1->rmap_loc; 1698 cmap_loc = smatis1->cmap_loc; 1699 #endif 1700 } 1701 1702 /* Post recv matrix values */ 1703 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1704 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1705 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1706 for (i=0; i<nrqs; ++i) { 1707 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1708 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1709 } 1710 1711 /* Allocate sending buffers for a->a, and send them off */ 1712 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1713 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1714 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1715 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1716 1717 for (i=0; i<nrqr; i++) { 1718 rbuf1_i = rbuf1[i]; 1719 sbuf_aa_i = sbuf_aa[i]; 1720 ct1 = 2*rbuf1_i[0]+1; 1721 ct2 = 0; 1722 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1723 1724 kmax = rbuf1_i[2]; 1725 for (k=0; k<kmax; k++,ct1++) { 1726 row = rbuf1_i[ct1] - rstart; 1727 nzA = ai[row+1] - ai[row]; 1728 nzB = bi[row+1] - 
bi[row]; 1729 ncols = nzA + nzB; 1730 cworkB = bj + bi[row]; 1731 vworkA = a_a + ai[row]; 1732 vworkB = b_a + bi[row]; 1733 1734 /* load the column values for this row into vals*/ 1735 vals = sbuf_aa_i + ct2; 1736 1737 lwrite = 0; 1738 for (l=0; l<nzB; l++) { 1739 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1740 } 1741 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1742 for (l=0; l<nzB; l++) { 1743 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1744 } 1745 1746 ct2 += ncols; 1747 } 1748 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1749 } 1750 1751 /* Assemble submat */ 1752 /* First assemble the local rows */ 1753 for (j=0; j<nrow; j++) { 1754 row = irow[j]; 1755 proc = row2proc[j]; 1756 if (proc == rank) { 1757 Crow = row - rstart; /* local row index of C */ 1758 #if defined(PETSC_USE_CTABLE) 1759 row = rmap_loc[Crow]; /* row index of submat */ 1760 #else 1761 row = rmap[row]; 1762 #endif 1763 1764 if (allcolumns) { 1765 /* diagonal part A = c->A */ 1766 ncols = ai[Crow+1] - ai[Crow]; 1767 cols = aj + ai[Crow]; 1768 vals = a->a + ai[Crow]; 1769 i = 0; 1770 for (k=0; k<ncols; k++) { 1771 subcols[i] = cols[k] + cstart; 1772 subvals[i++] = vals[k]; 1773 } 1774 1775 /* off-diagonal part B = c->B */ 1776 ncols = bi[Crow+1] - bi[Crow]; 1777 cols = bj + bi[Crow]; 1778 vals = b->a + bi[Crow]; 1779 for (k=0; k<ncols; k++) { 1780 subcols[i] = bmap[cols[k]]; 1781 subvals[i++] = vals[k]; 1782 } 1783 1784 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1785 1786 } else { /* !allcolumns */ 1787 #if defined(PETSC_USE_CTABLE) 1788 /* diagonal part A = c->A */ 1789 ncols = ai[Crow+1] - ai[Crow]; 1790 cols = aj + ai[Crow]; 1791 vals = a->a + ai[Crow]; 1792 i = 0; 1793 for (k=0; k<ncols; k++) { 1794 tcol = cmap_loc[cols[k]]; 1795 if (tcol) { 1796 subcols[i] = --tcol; 1797 subvals[i++] = vals[k]; 1798 } 1799 } 1800 1801 /* off-diagonal part B = c->B */ 
1802 ncols = bi[Crow+1] - bi[Crow]; 1803 cols = bj + bi[Crow]; 1804 vals = b->a + bi[Crow]; 1805 for (k=0; k<ncols; k++) { 1806 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1807 if (tcol) { 1808 subcols[i] = --tcol; 1809 subvals[i++] = vals[k]; 1810 } 1811 } 1812 #else 1813 /* diagonal part A = c->A */ 1814 ncols = ai[Crow+1] - ai[Crow]; 1815 cols = aj + ai[Crow]; 1816 vals = a->a + ai[Crow]; 1817 i = 0; 1818 for (k=0; k<ncols; k++) { 1819 tcol = cmap[cols[k]+cstart]; 1820 if (tcol) { 1821 subcols[i] = --tcol; 1822 subvals[i++] = vals[k]; 1823 } 1824 } 1825 1826 /* off-diagonal part B = c->B */ 1827 ncols = bi[Crow+1] - bi[Crow]; 1828 cols = bj + bi[Crow]; 1829 vals = b->a + bi[Crow]; 1830 for (k=0; k<ncols; k++) { 1831 tcol = cmap[bmap[cols[k]]]; 1832 if (tcol) { 1833 subcols[i] = --tcol; 1834 subvals[i++] = vals[k]; 1835 } 1836 } 1837 #endif 1838 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1839 } 1840 } 1841 } 1842 1843 /* Now assemble the off-proc rows */ 1844 for (i=0; i<nrqs; i++) { /* for each requested message */ 1845 /* recv values from other processes */ 1846 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1847 proc = pa[idex]; 1848 sbuf1_i = sbuf1[proc]; 1849 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1850 ct1 = 2 + 1; 1851 ct2 = 0; /* count of received C->j */ 1852 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1853 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1854 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1855 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1856 1857 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1858 max1 = sbuf1_i[2]; /* num of rows */ 1859 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1860 row = sbuf1_i[ct1]; /* row index of submat */ 
1861 if (!allcolumns) { 1862 idex = 0; 1863 if (scall == MAT_INITIAL_MATRIX) { 1864 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1865 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1866 #if defined(PETSC_USE_CTABLE) 1867 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1868 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1869 } else { 1870 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1871 } 1872 #else 1873 tcol = cmap[rbuf3_i[ct2]]; 1874 #endif 1875 if (tcol) { 1876 subcols[idex] = --tcol; 1877 subvals[idex++] = rbuf4_i[ct2]; 1878 1879 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1880 For reuse, we replace received C->j with index that should be inserted to submat */ 1881 rbuf3_i[ct3++] = ct2; 1882 } 1883 } 1884 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1885 1886 } else { /* scall == MAT_REUSE_MATRIX */ 1887 submat = submats[0]; 1888 subc = (Mat_SeqAIJ*)submat->data; 1889 1890 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1891 for (l=0; l<nnz; l++) { 1892 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1893 subvals[idex++] = rbuf4_i[ct2]; 1894 } 1895 1896 bj = subc->j + subc->i[row]; 1897 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1898 } 1899 } else { /* allcolumns */ 1900 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1901 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1902 ct2 += nnz; 1903 } 1904 } 1905 } 1906 1907 /* sending a->a are done */ 1908 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1909 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1910 1911 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1912 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1913 submats[0] = submat; 1914 1915 /* 
Restore the indices */ 1916 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1917 if (!allcolumns) { 1918 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1919 } 1920 1921 /* Destroy allocated memory */ 1922 for (i=0; i<nrqs; ++i) { 1923 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1924 } 1925 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1926 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1927 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1928 1929 if (scall == MAT_INITIAL_MATRIX) { 1930 ierr = PetscFree(lens);CHKERRQ(ierr); 1931 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1932 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1933 } 1934 PetscFunctionReturn(0); 1935 } 1936 1937 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1938 { 1939 PetscErrorCode ierr; 1940 PetscInt ncol; 1941 PetscBool colflag,allcolumns=PETSC_FALSE; 1942 1943 PetscFunctionBegin; 1944 /* Allocate memory to hold all the submatrices */ 1945 if (scall == MAT_INITIAL_MATRIX) { 1946 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1947 } 1948 1949 /* Check for special case: each processor gets entire matrix columns */ 1950 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1951 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1952 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1953 1954 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1955 PetscFunctionReturn(0); 1956 } 1957 1958 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1959 { 1960 PetscErrorCode ierr; 1961 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2]; 1962 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1963 Mat_SeqAIJ *subc; 1964 Mat_SubSppt *smat; 1965 1966 PetscFunctionBegin; 1967 /* Check for special case: each processor has a single IS */ 1968 if 
(C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1969 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1970 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */ 1971 PetscFunctionReturn(0); 1972 } 1973 1974 /* Collect global wantallmatrix and nstages */ 1975 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1976 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1977 if (!nmax) nmax = 1; 1978 1979 if (scall == MAT_INITIAL_MATRIX) { 1980 /* Collect global wantallmatrix and nstages */ 1981 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1982 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1983 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1984 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1985 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1986 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1987 wantallmatrix = PETSC_TRUE; 1988 1989 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1990 } 1991 } 1992 1993 /* Determine the number of stages through which submatrices are done 1994 Each stage will extract nmax submatrices. 1995 nmax is determined by the matrix column dimension. 1996 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1997 */ 1998 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 1999 2000 in[0] = -1*(PetscInt)wantallmatrix; 2001 in[1] = nstages; 2002 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2003 wantallmatrix = (PetscBool)(-out[0]); 2004 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2005 2006 } else { /* MAT_REUSE_MATRIX */ 2007 if (ismax) { 2008 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2009 smat = subc->submatis1; 2010 } else { /* (*submat)[0] is a dummy matrix */ 2011 smat = (Mat_SubSppt*)(*submat)[0]->data; 2012 } 2013 if (!smat) { 2014 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2015 wantallmatrix = PETSC_TRUE; 2016 } else if (smat->singleis) { 2017 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2018 PetscFunctionReturn(0); 2019 } else { 2020 nstages = smat->nstages; 2021 } 2022 } 2023 2024 if (wantallmatrix) { 2025 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2026 PetscFunctionReturn(0); 2027 } 2028 2029 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2030 if (scall == MAT_INITIAL_MATRIX) { 2031 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2032 } 2033 2034 for (i=0,pos=0; i<nstages; i++) { 2035 if (pos+nmax <= ismax) max_no = nmax; 2036 else if (pos == ismax) max_no = 0; 2037 else max_no = ismax-pos; 2038 2039 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2040 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2041 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2042 smat->nstages = nstages; 2043 } 2044 pos += max_no; 2045 } 2046 2047 if (ismax && scall == MAT_INITIAL_MATRIX) { 2048 /* save nstages for reuse */ 2049 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2050 smat = subc->submatis1; 2051 smat->nstages = nstages; 2052 } 2053 PetscFunctionReturn(0); 2054 } 2055 2056 /* 
-------------------------------------------------------------------------*/
/*
   MatCreateSubMatrices_MPIAIJ_Local - workhorse for one stage of submatrix
   extraction: builds ismax sequential AIJ matrices, one per (isrow[i],iscol[i]).

   Communication protocol (four message rounds, all nonblocking):
     tag0 - requested global row indices, sent to each owning rank;
     tag2 - per-requested-row nonzero counts, returned by the owners;
     tag3 - column indices (C->j) of the requested rows;
     tag4 - numerical values (C->a) of the requested rows.
   On MAT_INITIAL_MATRIX all buffers, the row/column maps and the request
   bookkeeping are built and then stashed in a Mat_SubSppt attached to each
   submatrix, so MAT_REUSE_MATRIX can replay only the tag4 (values) exchange.
   With ismax == 0 a dummy MATDUMMY submats[0] carries the Mat_SubSppt.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  /* per-IS arrays: row->owner maps, column/row global->local maps, allcolumns flags */
  ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
  ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
    if (!issorted[i]) iscsorted = issorted[i]; /* any unsorted column IS forces a final per-row sort */

    ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);

    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);

    /* Check for special case: allcolumn */
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i] = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      /* recover the communication state stashed on the INITIAL call; note the
         scalar fields (nrqs,nrqr,buffers,...) are identical across all i */
      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
      if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them*/
    ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */

    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      ierr   = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
      row2proc[i] = row2proc_i;

      /* for a sorted IS the owner search can resume from the previous row's owner */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j      = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz   += w1[j];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    tag0 = ((PetscObject)C)->tag;
    ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    /* tmp holds 2*msz PetscInts: first msz for the send payload, second msz reused
       below as the rbuf2 receive area */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

    {
      PetscInt *iptr = tmp;
      k    = 0;
      for (i=0; i<nrqs; i++) {
        j        = pa[i];
        iptr    += k;
        sbuf1[j] = iptr;
        k        = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
      ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0];     /* header slot 0 counts the ISs in this message */
          sbuf1_j[2*k]   = ctr_j;            /* number of rows requested for IS i */
          sbuf1_j[2*k-1] = i;                /* IS id */
        }
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
    rbuf2[0] = tmp + msz; /* second half of tmp; replies mirror the request layout */
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
    {
      PetscInt   *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt   *sbuf2_i;

      ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i        = rbuf1[i];
        start          = 2*rbuf1_i[0] + 1;  /* first row index follows the header */
        ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
        ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
        sbuf2_i        = sbuf2[i];
        for (j=start; j<end; j++) {
          id              = rbuf1_i[j] - rstart;
          ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; /* diag + off-diag nnz */
          sbuf2_i[j]      = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = r_status1[i].MPI_SOURCE;
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);
    ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

    /* Receive messages*/
    ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* rbuf2[i][0] is the total number of column indices this source will send */
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }
    ierr = PetscFree(r_status2);CHKERRQ(ierr);
    ierr = PetscFree(r_waits2);CHKERRQ(ierr);

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

    if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
    ierr = PetscFree(s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_status2);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols, in ascending global
               order: B-columns left of the diagonal block, then A (shifted by
               cstart), then B-columns right of it */
            cols = sbuf_aj_i + ct2;

            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);

    /* create col map: global col of C -> local col of submatrices */
    /* stored values are local-index+1 so that 0 means "column not selected" */
#if defined(PETSC_USE_CTABLE)
    {
      const PetscInt *icol_i;
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
          }
        } else cmap[i] = NULL;
      }
    }
#else
    {
      const PetscInt *icol_i;
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            cmap_i[icol_i[j]] = j+1;
          }
        } else cmap[i] = NULL;
      }
    }
#endif

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);

    if (ismax) {
      ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    for (i=0; i<ismax; i++) {
      ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
            row--;
            if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    ierr = PetscFree(r_waits3);CHKERRQ(ierr);
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
    ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree(s_waits3);CHKERRQ(ierr);

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNew(&smat_i);CHKERRQ(ierr);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      /* chain the destroy op so the shared buffers below are released when the
         last submatrix is destroyed */
      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = allcolumns[i];
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = row2proc[i];
      smat_i->rmap       = rmap[i];
      smat_i->cmap       = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = NULL;
      smat_i->rmap       = NULL;
      smat_i->cmap       = NULL;
    }

    if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  /* everything from here on runs for both INITIAL and REUSE: exchange a->a values (tag4) */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          /* same B-left / A / B-right ordering as the tag3 column-index message */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc      = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen = subc->ilen;
    imat_j    = subc->j;
    imat_i    = subc->i;
    imat_a    = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
        mat_i    = imat_i[row];
        mat_a    = imat_a + mat_i;
        mat_j    = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1; /* cmap stores local+1; 0 means column dropped */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k]; /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no     = sbuf1_i[2*j-1];
      rmap_i    = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  /* (buffers stashed in Mat_SubSppt are NOT freed here - the chained destroy does that) */
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);

  ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
    Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.
2874 2875 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2876 Following this call, C->A & C->B have been created, even if empty. 2877 */ 2878 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2879 { 2880 /* If making this function public, change the error returned in this function away from _PLIB. */ 2881 PetscErrorCode ierr; 2882 Mat_MPIAIJ *aij; 2883 Mat_SeqAIJ *Baij; 2884 PetscBool seqaij,Bdisassembled; 2885 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2886 PetscScalar v; 2887 const PetscInt *rowindices,*colindices; 2888 2889 PetscFunctionBegin; 2890 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2891 if (A) { 2892 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2893 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2894 if (rowemb) { 2895 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2896 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2897 } else { 2898 if (C->rmap->n != A->rmap->n) { 2899 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2900 } 2901 } 2902 if (dcolemb) { 2903 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2904 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2905 } else { 2906 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2907 } 2908 } 2909 if (B) { 2910 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2911 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2912 if (rowemb) { 2913 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2914 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2915 } else { 2916 if (C->rmap->n != B->rmap->n) { 2917 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2918 } 2919 } 2920 if (ocolemb) { 2921 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2922 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2923 } else { 2924 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2925 } 2926 } 2927 2928 aij = (Mat_MPIAIJ*)C->data; 2929 if (!aij->A) { 2930 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2931 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2932 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2933 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2934 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2935 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2936 } 2937 if (A) { 2938 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2939 } else { 2940 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2941 } 2942 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2943 /* 2944 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2945 need to "disassemble" B -- convert it to using C's global indices. 2946 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2947 2948 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2949 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2950 2951 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2952 At least avoid calling MatSetValues() and the implied searches? 2953 */ 2954 2955 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2956 #if defined(PETSC_USE_CTABLE) 2957 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2958 #else 2959 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2960 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2961 if (aij->B) { 2962 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2963 } 2964 #endif 2965 ngcol = 0; 2966 if (aij->lvec) { 2967 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2968 } 2969 if (aij->garray) { 2970 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2971 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2972 } 2973 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2974 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2975 } 2976 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2977 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2978 } 2979 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2980 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2981 } 2982 } 2983 Bdisassembled = PETSC_FALSE; 2984 if (!aij->B) { 2985 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2986 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2987 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2988 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2989 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2990 Bdisassembled = PETSC_TRUE; 2991 } 2992 if (B) { 2993 Baij = (Mat_SeqAIJ*)B->data; 2994 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2995 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2996 for (i=0; i<B->rmap->n; i++) { 2997 nz[i] = Baij->i[i+1] - Baij->i[i]; 2998 } 2999 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3000 ierr = PetscFree(nz);CHKERRQ(ierr); 3001 } 3002 3003 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3004 shift = rend-rstart; 3005 count = 0; 3006 rowindices = NULL; 3007 colindices = NULL; 3008 if (rowemb) { 3009 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3010 } 3011 if (ocolemb) { 3012 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3013 } 3014 for (i=0; i<B->rmap->n; i++) { 3015 PetscInt row; 3016 row = i; 3017 if (rowindices) row = rowindices[i]; 3018 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3019 col = Baij->j[count]; 3020 if (colindices) col = colindices[col]; 3021 if (Bdisassembled && col>=rstart) col += shift; 3022 v = Baij->a[count]; 3023 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3024 ++count; 3025 } 3026 } 3027 /* No assembly for aij->B is necessary. */ 3028 /* FIXME: set aij->B's nonzerostate correctly. */ 3029 } else { 3030 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3031 } 3032 C->preallocated = PETSC_TRUE; 3033 C->was_assembled = PETSC_FALSE; 3034 C->assembled = PETSC_FALSE; 3035 /* 3036 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3037 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3038 */ 3039 PetscFunctionReturn(0); 3040 } 3041 3042 /* 3043 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3044 */ 3045 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3046 { 3047 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3048 3049 PetscFunctionBegin; 3050 PetscValidPointer(A,2); 3051 PetscValidPointer(B,3); 3052 /* FIXME: make sure C is assembled */ 3053 *A = aij->A; 3054 *B = aij->B; 3055 /* Note that we don't incref *A and *B, so be careful! */ 3056 PetscFunctionReturn(0); 3057 } 3058 3059 /* 3060 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
3061 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3062 */ 3063 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3064 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3065 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3066 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3067 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3068 { 3069 PetscErrorCode ierr; 3070 PetscMPIInt isize,flag; 3071 PetscInt i,ii,cismax,ispar; 3072 Mat *A,*B; 3073 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3074 3075 PetscFunctionBegin; 3076 if (!ismax) PetscFunctionReturn(0); 3077 3078 for (i = 0, cismax = 0; i < ismax; ++i) { 3079 PetscMPIInt isize; 3080 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3081 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3082 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3083 if (isize > 1) ++cismax; 3084 } 3085 3086 /* 3087 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3088 ispar counts the number of parallel ISs across C's comm. 3089 */ 3090 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3091 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3092 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3093 PetscFunctionReturn(0); 3094 } 3095 3096 /* if (ispar) */ 3097 /* 3098 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3099 These are used to extract the off-diag portion of the resulting parallel matrix. 
3100 The row IS for the off-diag portion is the same as for the diag portion, 3101 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3102 */ 3103 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3104 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3105 for (i = 0, ii = 0; i < ismax; ++i) { 3106 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3107 if (isize > 1) { 3108 /* 3109 TODO: This is the part that's ***NOT SCALABLE***. 3110 To fix this we need to extract just the indices of C's nonzero columns 3111 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3112 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3113 to be done without serializing on the IS list, so, most likely, it is best 3114 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3115 */ 3116 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3117 /* Now we have to 3118 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3119 were sorted on each rank, concatenated they might no longer be sorted; 3120 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3121 indices in the nondecreasing order to the original index positions. 3122 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3123 */ 3124 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3125 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3126 ++ii; 3127 } 3128 } 3129 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3130 for (i = 0, ii = 0; i < ismax; ++i) { 3131 PetscInt j,issize; 3132 const PetscInt *indices; 3133 3134 /* 3135 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 
3136 */ 3137 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3138 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3139 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3140 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3141 for (j = 1; j < issize; ++j) { 3142 if (indices[j] == indices[j-1]) { 3143 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3144 } 3145 } 3146 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3147 3148 3149 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3150 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3151 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3152 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3153 for (j = 1; j < issize; ++j) { 3154 if (indices[j-1] == indices[j]) { 3155 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3156 } 3157 } 3158 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3159 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3160 if (isize > 1) { 3161 cisrow[ii] = isrow[i]; 3162 ++ii; 3163 } 3164 } 3165 /* 3166 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3167 array of sequential matrices underlying the resulting parallel matrices. 3168 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3169 contain duplicates. 3170 3171 There are as many diag matrices as there are original index sets. There are only as many parallel 3172 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
3173 3174 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3175 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3176 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3177 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3178 with A[i] and B[ii] extracted from the corresponding MPI submat. 3179 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3180 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3181 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3182 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3183 values into A[i] and B[ii] sitting inside the corresponding submat. 3184 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3185 A[i], B[ii], AA[i] or BB[ii] matrices. 3186 */ 3187 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3188 if (scall == MAT_INITIAL_MATRIX) { 3189 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3190 } 3191 3192 /* Now obtain the sequential A and B submatrices separately. 
*/ 3193 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3194 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3195 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3196 3197 /* 3198 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3199 matrices A & B have been extracted directly into the parallel matrices containing them, or 3200 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3201 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3202 to have the same sparsity pattern. 3203 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3204 must be constructed for C. This is done by setseqmat(s). 3205 */ 3206 for (i = 0, ii = 0; i < ismax; ++i) { 3207 /* 3208 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3209 That way we can avoid sorting and computing permutations when reusing. 3210 To this end: 3211 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3212 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3213 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3214 */ 3215 MatStructure pattern; 3216 pattern = DIFFERENT_NONZERO_PATTERN; 3217 3218 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3219 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3220 if (isize > 1) { 3221 if (scall == MAT_INITIAL_MATRIX) { 3222 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3223 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3224 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3225 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3226 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3227 } 3228 /* 3229 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3230 */ 3231 { 3232 Mat AA,BB; 3233 AA = A[i]; 3234 BB = B[ii]; 3235 if (AA || BB) { 3236 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3237 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3238 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3239 } 3240 3241 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3242 } 3243 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3244 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3245 ++ii; 3246 } else { /* if (isize == 1) */ 3247 if (scall == MAT_REUSE_MATRIX) { 3248 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3249 } 3250 if (isrow_p[i] || iscol_p[i]) { 3251 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3252 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3253 /* Otherwise A is extracted straight into (*submats)[i]. */ 3254 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3255 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3256 } else (*submat)[i] = A[i]; 3257 } 3258 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3259 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3260 } 3261 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3262 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3263 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3264 ierr = PetscFree(A);CHKERRQ(ierr); 3265 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3266 PetscFunctionReturn(0); 3267 } 3268 3269 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3270 { 3271 PetscErrorCode ierr; 3272 3273 PetscFunctionBegin; 3274 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3275 PetscFunctionReturn(0); 3276 } 3277