1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - add one layer of overlap to each of
   the nidx index sets in is[].

   Rows owned by remote ranks are gathered, shipped to their owners with a
   PetscSF, expanded there into their column indices (MatIncreaseOverlap_MPIAIJ_Send_Scalable),
   and the expanded data is merged back into is[] by
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable; rows owned locally are expanded
   in place by MatIncreaseOverlap_MPIAIJ_Local_Scalable.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = 0,*sbdata = 0;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++){
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++){
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank){
        tosizes_temp[owner]++; /* number of rows to owner */
        rrow_ranks[nrrows]   = owner; /* processor */
        rrow_isids[nrrows]   = i; /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows){
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  /* tosizes holds (message size, offset) pairs, one pair per destination rank */
  for (i=0; i<size; i++){
    if (tosizes_temp[i]){
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto; /* a map from processor to index */
      toranks[nto++]  = i; /* processor */
    }
  }
  ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  for (i=0; i<nto; i++){
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i]; /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++){
    for (j=0; j<fromsizes[2*i]; j++){
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++){
    owner = rrow_ranks[i]; /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++){
    for (j=0; j<tosizes[2*i]; j++){
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - merge the overlap data received
   for each index set back into is[]; recvdata is a stream of
   <is id, count, count rows> triples (see the "message triple" comment in
   MatIncreaseOverlap_MPIAIJ_Send_Scalable). Each is[i] is destroyed and
   recreated with the merged, sorted, duplicate-free indices.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ?
lsize:max_lsize;
    isz[i] = lsize;
  }
  /* each IS gets a stride of max_lsize+nrecvs: enough for its current entries
     plus, in the worst case, every received value */
  ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs; ){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - for each requesting rank, expand the
   locally owned rows it asked for into their column indices (from both the
   diagonal Amat and off-diagonal Bmat parts) and pack the reply buffers.

   fromrows is a stream of <is id, row> pairs grouped per sender (sizes in
   fromsizes). Outputs: *sbrows is a stream of <is id, count, count columns>
   triples per destination, *sbrowsizes its per-destination (size, offset) pairs.
   The caller frees both output arrays.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count the (deduplicated) columns per <sender, is>, so the
     reply buffer can be allocated once */
  for (i=0; i<nfrom; i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* use this code again */
  /* second pass: repeat the expansion, this time writing the triples into sbdata */
  for (i=0;i<nfrom;i++){
    ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - expand each is[i] with the column
   indices of its locally owned rows (Amat and Bmat parts); entries owned by
   other ranks are skipped here (they are handled by the send/receive path).
   Each is[i] is destroyed and recreated sorted and duplicate-free.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp,owner;
  PetscMPIInt    rank;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  /* NOTE(review): tnz bounds the workspace only if each local row appears at most
     once per IS — presumably guaranteed by the sort/dedup of the previous pass; verify */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


/*
  Sample message format:
  If a processor A wants processor B to process some elements
corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    /* w1[j]: total entries destined for rank j; w3[j]: no of ISs with entries for j */
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into one segment per destination rank */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* one scratch array sized for the largest table suffices for all ISs below */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each is[i] from the accumulated solution */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k; /* table stores key+1 -> position+1; undo both shifts */
    }
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr =
PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row; /* table stores key+1 -> position+1; undo both shifts */
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* expand each row currently in the solution; new entries are appended after
       position max and are NOT themselves expanded in this pass */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
     MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

     Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

     For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather then all previous rows as it is now where a single large chunck of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* count total requested rows (ct) and total row indices over all messages
     (total_sz); each message rbuf[i] is laid out as
     [nis, (isid,len) pairs..., row indices...] so the lengths sit at even slots */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial size guess for the single shared reply buffer xdata[0];
     grown on demand inside the loops below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1; /* read cursor: first row index after the header */
    ct2    = ct1;        /* write cursor into xdata[i] (header space reserved) */
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* shared buffer full: grow by 1.5x, copy, and re-point the
               xdata[1..i] aliases (they are offsets into xdata[0]'s block) */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* for each requested row just recorded, append the global column indices
         of all its nonzeros (one level of overlap) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts immediately after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix (as a single sequential matrix,
    gathered via MPI_Allgatherv)
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /*
       ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    /* lens is indexed by GLOBAL row; each rank fills only its own slice here */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write directly into B's j array at the slot for this rank's first row */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* each local row is emitted in ascending global column order:
       off-diagonal columns below the diagonal block, then the diagonal
       block, then off-diagonal columns above it */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse: caller supplies the previously created sequential matrix */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the column-index pass above,
       so values land next to their matching column indices */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path skipped the allocation above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry-type options from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt    ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt    *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status  *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm    comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1253 PetscMPIInt *onodes1,*olengths1,idex,end; 1254 Mat_SubSppt *smatis1; 1255 PetscBool isrowsorted; 1256 1257 PetscFunctionBegin; 1258 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1259 1260 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1261 size = c->size; 1262 rank = c->rank; 1263 1264 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1265 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1266 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1267 if (allcolumns) { 1268 icol = NULL; 1269 ncol = C->cmap->N; 1270 } else { 1271 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1272 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1273 } 1274 1275 if (scall == MAT_INITIAL_MATRIX) { 1276 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1277 1278 /* Get some new tags to keep the communication clean */ 1279 tag1 = ((PetscObject)C)->tag; 1280 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1281 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1282 1283 /* evaluate communication - mesg to who, length of mesg, and buffer space 1284 required. 
Based on this, buffers are allocated, and data copied into them */ 1285 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1286 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1287 1288 /* w1[proc] = num of rows owned by proc -- to be requested */ 1289 proc = 0; 1290 nrqs = 0; /* num of outgoing messages */ 1291 for (j=0; j<nrow; j++) { 1292 row = irow[j]; 1293 if (!isrowsorted) proc = 0; 1294 while (row >= C->rmap->range[proc+1]) proc++; 1295 w1[proc]++; 1296 row2proc[j] = proc; /* map row index to proc */ 1297 1298 if (proc != rank && !w2[proc]) { 1299 w2[proc] = 1; nrqs++; 1300 } 1301 } 1302 w1[rank] = 0; /* rows owned by self will not be requested */ 1303 1304 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1305 for (proc=0,j=0; proc<size; proc++) { 1306 if (w1[proc]) { pa[j++] = proc;} 1307 } 1308 1309 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1310 msz = 0; /* total mesg length (for all procs) */ 1311 for (i=0; i<nrqs; i++) { 1312 proc = pa[i]; 1313 w1[proc] += 3; 1314 msz += w1[proc]; 1315 } 1316 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1317 1318 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1319 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1320 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1321 1322 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1323 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1324 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1325 1326 /* Now post the Irecvs corresponding to these messages */ 1327 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1328 1329 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1330 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1331 1332 /* Allocate Memory for outgoing messages 
*/ 1333 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1334 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1335 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1336 1337 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1338 iptr = tmp; 1339 for (i=0; i<nrqs; i++) { 1340 proc = pa[i]; 1341 sbuf1[proc] = iptr; 1342 iptr += w1[proc]; 1343 } 1344 1345 /* Form the outgoing messages */ 1346 /* Initialize the header space */ 1347 for (i=0; i<nrqs; i++) { 1348 proc = pa[i]; 1349 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1350 ptr[proc] = sbuf1[proc] + 3; 1351 } 1352 1353 /* Parse the isrow and copy data into outbuf */ 1354 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1355 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1356 proc = row2proc[j]; 1357 if (proc != rank) { /* copy to the outgoing buf*/ 1358 *ptr[proc] = irow[j]; 1359 ctr[proc]++; ptr[proc]++; 1360 } 1361 } 1362 1363 /* Update the headers for the current IS */ 1364 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1365 if ((ctr_j = ctr[j])) { 1366 sbuf1_j = sbuf1[j]; 1367 k = ++sbuf1_j[0]; 1368 sbuf1_j[2*k] = ctr_j; 1369 sbuf1_j[2*k-1] = 0; 1370 } 1371 } 1372 1373 /* Now post the sends */ 1374 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1375 for (i=0; i<nrqs; ++i) { 1376 proc = pa[i]; 1377 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1378 } 1379 1380 /* Post Receives to capture the buffer size */ 1381 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1382 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1383 1384 rbuf2[0] = tmp + msz; 1385 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1386 1387 for (i=0; i<nrqs; ++i) { 1388 proc = pa[i]; 1389 ierr = 
MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1390 } 1391 1392 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1393 1394 /* Send to other procs the buf size they should allocate */ 1395 /* Receive messages*/ 1396 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1397 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1398 1399 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1400 for (i=0; i<nrqr; ++i) { 1401 req_size[i] = 0; 1402 rbuf1_i = rbuf1[i]; 1403 start = 2*rbuf1_i[0] + 1; 1404 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1405 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1406 sbuf2_i = sbuf2[i]; 1407 for (j=start; j<end; j++) { 1408 k = rbuf1_i[j] - rstart; 1409 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1410 sbuf2_i[j] = ncols; 1411 req_size[i] += ncols; 1412 } 1413 req_source1[i] = r_status1[i].MPI_SOURCE; 1414 1415 /* form the header */ 1416 sbuf2_i[0] = req_size[i]; 1417 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1418 1419 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1420 } 1421 1422 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1423 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1424 1425 /* rbuf2 is received, Post recv column indices a->j */ 1426 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1427 1428 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1429 for (i=0; i<nrqs; ++i) { 1430 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1431 req_source2[i] = r_status2[i].MPI_SOURCE; 1432 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1433 } 1434 1435 /* Wait on sends1 and sends2 */ 1436 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1437 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1438 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1439 ierr = 
PetscFree(s_status1);CHKERRQ(ierr); 1440 1441 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1442 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1443 1444 /* Now allocate sending buffers for a->j, and send them off */ 1445 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1446 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1447 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1448 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1449 1450 for (i=0; i<nrqr; i++) { /* for each requested message */ 1451 rbuf1_i = rbuf1[i]; 1452 sbuf_aj_i = sbuf_aj[i]; 1453 ct1 = 2*rbuf1_i[0] + 1; 1454 ct2 = 0; 1455 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1456 1457 kmax = rbuf1[i][2]; 1458 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1459 row = rbuf1_i[ct1] - rstart; 1460 nzA = ai[row+1] - ai[row]; 1461 nzB = bi[row+1] - bi[row]; 1462 ncols = nzA + nzB; 1463 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1464 1465 /* load the column indices for this row into cols*/ 1466 cols = sbuf_aj_i + ct2; 1467 1468 lwrite = 0; 1469 for (l=0; l<nzB; l++) { 1470 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1471 } 1472 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1473 for (l=0; l<nzB; l++) { 1474 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1475 } 1476 1477 ct2 += ncols; 1478 } 1479 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1480 } 1481 1482 /* create column map (cmap): global col of C -> local col of submat */ 1483 #if defined(PETSC_USE_CTABLE) 1484 if (!allcolumns) { 1485 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1486 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1487 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1488 if (icol[j] >= cstart && icol[j] <cend) { 1489 cmap_loc[icol[j] - cstart] = j+1; 1490 } else { /* use PetscTable 
for non-local col indices */ 1491 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1492 } 1493 } 1494 } else { 1495 cmap = NULL; 1496 cmap_loc = NULL; 1497 } 1498 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1499 #else 1500 if (!allcolumns) { 1501 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1502 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1503 } else { 1504 cmap = NULL; 1505 } 1506 #endif 1507 1508 /* Create lens for MatSeqAIJSetPreallocation() */ 1509 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1510 1511 /* Compute lens from local part of C */ 1512 for (j=0; j<nrow; j++) { 1513 row = irow[j]; 1514 proc = row2proc[j]; 1515 if (proc == rank) { 1516 /* diagonal part A = c->A */ 1517 ncols = ai[row-rstart+1] - ai[row-rstart]; 1518 cols = aj + ai[row-rstart]; 1519 if (!allcolumns) { 1520 for (k=0; k<ncols; k++) { 1521 #if defined(PETSC_USE_CTABLE) 1522 tcol = cmap_loc[cols[k]]; 1523 #else 1524 tcol = cmap[cols[k]+cstart]; 1525 #endif 1526 if (tcol) lens[j]++; 1527 } 1528 } else { /* allcolumns */ 1529 lens[j] = ncols; 1530 } 1531 1532 /* off-diagonal part B = c->B */ 1533 ncols = bi[row-rstart+1] - bi[row-rstart]; 1534 cols = bj + bi[row-rstart]; 1535 if (!allcolumns) { 1536 for (k=0; k<ncols; k++) { 1537 #if defined(PETSC_USE_CTABLE) 1538 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1539 #else 1540 tcol = cmap[bmap[cols[k]]]; 1541 #endif 1542 if (tcol) lens[j]++; 1543 } 1544 } else { /* allcolumns */ 1545 lens[j] += ncols; 1546 } 1547 } 1548 } 1549 1550 /* Create row map (rmap): global row of C -> local row of submat */ 1551 #if defined(PETSC_USE_CTABLE) 1552 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1553 for (j=0; j<nrow; j++) { 1554 row = irow[j]; 1555 proc = row2proc[j]; 1556 if (proc == rank) { /* a local row */ 1557 rmap_loc[row - rstart] = j; 1558 } else { 1559 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1560 } 1561 } 1562 #else 1563 ierr = 
PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1564 for (j=0; j<nrow; j++) { 1565 rmap[irow[j]] = j; 1566 } 1567 #endif 1568 1569 /* Update lens from offproc data */ 1570 /* recv a->j is done */ 1571 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1572 for (i=0; i<nrqs; i++) { 1573 proc = pa[i]; 1574 sbuf1_i = sbuf1[proc]; 1575 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1576 ct1 = 2 + 1; 1577 ct2 = 0; 1578 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1579 rbuf3_i = rbuf3[i]; /* received C->j */ 1580 1581 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1582 max1 = sbuf1_i[2]; 1583 for (k=0; k<max1; k++,ct1++) { 1584 #if defined(PETSC_USE_CTABLE) 1585 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1586 row--; 1587 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1588 #else 1589 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1590 #endif 1591 /* Now, store row index of submat in sbuf1_i[ct1] */ 1592 sbuf1_i[ct1] = row; 1593 1594 nnz = rbuf2_i[ct1]; 1595 if (!allcolumns) { 1596 for (l=0; l<nnz; l++,ct2++) { 1597 #if defined(PETSC_USE_CTABLE) 1598 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1599 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1600 } else { 1601 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1602 } 1603 #else 1604 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1605 #endif 1606 if (tcol) lens[row]++; 1607 } 1608 } else { /* allcolumns */ 1609 lens[row] += nnz; 1610 } 1611 } 1612 } 1613 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1614 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1615 1616 /* Create the submatrices */ 1617 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1618 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1619 1620 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 
1621 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1622 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1623 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1624 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1625 1626 /* create struct Mat_SubSppt and attached it to submat */ 1627 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1628 subc = (Mat_SeqAIJ*)submat->data; 1629 subc->submatis1 = smatis1; 1630 1631 smatis1->id = 0; 1632 smatis1->nrqs = nrqs; 1633 smatis1->nrqr = nrqr; 1634 smatis1->rbuf1 = rbuf1; 1635 smatis1->rbuf2 = rbuf2; 1636 smatis1->rbuf3 = rbuf3; 1637 smatis1->sbuf2 = sbuf2; 1638 smatis1->req_source2 = req_source2; 1639 1640 smatis1->sbuf1 = sbuf1; 1641 smatis1->ptr = ptr; 1642 smatis1->tmp = tmp; 1643 smatis1->ctr = ctr; 1644 1645 smatis1->pa = pa; 1646 smatis1->req_size = req_size; 1647 smatis1->req_source1 = req_source1; 1648 1649 smatis1->allcolumns = allcolumns; 1650 smatis1->singleis = PETSC_TRUE; 1651 smatis1->row2proc = row2proc; 1652 smatis1->rmap = rmap; 1653 smatis1->cmap = cmap; 1654 #if defined(PETSC_USE_CTABLE) 1655 smatis1->rmap_loc = rmap_loc; 1656 smatis1->cmap_loc = cmap_loc; 1657 #endif 1658 1659 smatis1->destroy = submat->ops->destroy; 1660 submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices; 1661 submat->factortype = C->factortype; 1662 1663 /* compute rmax */ 1664 rmax = 0; 1665 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1666 1667 } else { /* scall == MAT_REUSE_MATRIX */ 1668 submat = submats[0]; 1669 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1670 1671 subc = (Mat_SeqAIJ*)submat->data; 1672 rmax = subc->rmax; 1673 smatis1 = subc->submatis1; 1674 nrqs = smatis1->nrqs; 1675 nrqr = smatis1->nrqr; 1676 rbuf1 = smatis1->rbuf1; 1677 rbuf2 = smatis1->rbuf2; 1678 rbuf3 = smatis1->rbuf3; 1679 req_source2 = smatis1->req_source2; 1680 1681 sbuf1 = smatis1->sbuf1; 1682 sbuf2 = smatis1->sbuf2; 1683 ptr = smatis1->ptr; 1684 tmp = smatis1->tmp; 1685 ctr = smatis1->ctr; 1686 1687 pa = smatis1->pa; 1688 req_size = smatis1->req_size; 1689 req_source1 = smatis1->req_source1; 1690 1691 allcolumns = smatis1->allcolumns; 1692 row2proc = smatis1->row2proc; 1693 rmap = smatis1->rmap; 1694 cmap = smatis1->cmap; 1695 #if defined(PETSC_USE_CTABLE) 1696 rmap_loc = smatis1->rmap_loc; 1697 cmap_loc = smatis1->cmap_loc; 1698 #endif 1699 } 1700 1701 /* Post recv matrix values */ 1702 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1703 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1704 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1705 for (i=0; i<nrqs; ++i) { 1706 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1707 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1708 } 1709 1710 /* Allocate sending buffers for a->a, and send them off */ 1711 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1712 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1713 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1714 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1715 1716 for (i=0; i<nrqr; i++) { 1717 rbuf1_i = rbuf1[i]; 1718 sbuf_aa_i = sbuf_aa[i]; 1719 ct1 = 2*rbuf1_i[0]+1; 1720 ct2 = 0; 1721 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1722 1723 kmax = rbuf1_i[2]; 1724 for (k=0; k<kmax; k++,ct1++) { 1725 row = rbuf1_i[ct1] - rstart; 1726 nzA = ai[row+1] - ai[row]; 1727 nzB = bi[row+1] - 
bi[row]; 1728 ncols = nzA + nzB; 1729 cworkB = bj + bi[row]; 1730 vworkA = a_a + ai[row]; 1731 vworkB = b_a + bi[row]; 1732 1733 /* load the column values for this row into vals*/ 1734 vals = sbuf_aa_i + ct2; 1735 1736 lwrite = 0; 1737 for (l=0; l<nzB; l++) { 1738 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1739 } 1740 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1741 for (l=0; l<nzB; l++) { 1742 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1743 } 1744 1745 ct2 += ncols; 1746 } 1747 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1748 } 1749 1750 /* Assemble submat */ 1751 /* First assemble the local rows */ 1752 for (j=0; j<nrow; j++) { 1753 row = irow[j]; 1754 proc = row2proc[j]; 1755 if (proc == rank) { 1756 Crow = row - rstart; /* local row index of C */ 1757 #if defined(PETSC_USE_CTABLE) 1758 row = rmap_loc[Crow]; /* row index of submat */ 1759 #else 1760 row = rmap[row]; 1761 #endif 1762 1763 if (allcolumns) { 1764 /* diagonal part A = c->A */ 1765 ncols = ai[Crow+1] - ai[Crow]; 1766 cols = aj + ai[Crow]; 1767 vals = a->a + ai[Crow]; 1768 i = 0; 1769 for (k=0; k<ncols; k++) { 1770 subcols[i] = cols[k] + cstart; 1771 subvals[i++] = vals[k]; 1772 } 1773 1774 /* off-diagonal part B = c->B */ 1775 ncols = bi[Crow+1] - bi[Crow]; 1776 cols = bj + bi[Crow]; 1777 vals = b->a + bi[Crow]; 1778 for (k=0; k<ncols; k++) { 1779 subcols[i] = bmap[cols[k]]; 1780 subvals[i++] = vals[k]; 1781 } 1782 1783 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1784 1785 } else { /* !allcolumns */ 1786 #if defined(PETSC_USE_CTABLE) 1787 /* diagonal part A = c->A */ 1788 ncols = ai[Crow+1] - ai[Crow]; 1789 cols = aj + ai[Crow]; 1790 vals = a->a + ai[Crow]; 1791 i = 0; 1792 for (k=0; k<ncols; k++) { 1793 tcol = cmap_loc[cols[k]]; 1794 if (tcol) { 1795 subcols[i] = --tcol; 1796 subvals[i++] = vals[k]; 1797 } 1798 } 1799 1800 /* off-diagonal part B = c->B */ 
1801 ncols = bi[Crow+1] - bi[Crow]; 1802 cols = bj + bi[Crow]; 1803 vals = b->a + bi[Crow]; 1804 for (k=0; k<ncols; k++) { 1805 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1806 if (tcol) { 1807 subcols[i] = --tcol; 1808 subvals[i++] = vals[k]; 1809 } 1810 } 1811 #else 1812 /* diagonal part A = c->A */ 1813 ncols = ai[Crow+1] - ai[Crow]; 1814 cols = aj + ai[Crow]; 1815 vals = a->a + ai[Crow]; 1816 i = 0; 1817 for (k=0; k<ncols; k++) { 1818 tcol = cmap[cols[k]+cstart]; 1819 if (tcol) { 1820 subcols[i] = --tcol; 1821 subvals[i++] = vals[k]; 1822 } 1823 } 1824 1825 /* off-diagonal part B = c->B */ 1826 ncols = bi[Crow+1] - bi[Crow]; 1827 cols = bj + bi[Crow]; 1828 vals = b->a + bi[Crow]; 1829 for (k=0; k<ncols; k++) { 1830 tcol = cmap[bmap[cols[k]]]; 1831 if (tcol) { 1832 subcols[i] = --tcol; 1833 subvals[i++] = vals[k]; 1834 } 1835 } 1836 #endif 1837 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1838 } 1839 } 1840 } 1841 1842 /* Now assemble the off-proc rows */ 1843 for (i=0; i<nrqs; i++) { /* for each requested message */ 1844 /* recv values from other processes */ 1845 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1846 proc = pa[idex]; 1847 sbuf1_i = sbuf1[proc]; 1848 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1849 ct1 = 2 + 1; 1850 ct2 = 0; /* count of received C->j */ 1851 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1852 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1853 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1854 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1855 1856 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1857 max1 = sbuf1_i[2]; /* num of rows */ 1858 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1859 row = sbuf1_i[ct1]; /* row index of submat */ 
1860 if (!allcolumns) { 1861 idex = 0; 1862 if (scall == MAT_INITIAL_MATRIX) { 1863 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1864 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1865 #if defined(PETSC_USE_CTABLE) 1866 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1867 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1868 } else { 1869 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1870 } 1871 #else 1872 tcol = cmap[rbuf3_i[ct2]]; 1873 #endif 1874 if (tcol) { 1875 subcols[idex] = --tcol; 1876 subvals[idex++] = rbuf4_i[ct2]; 1877 1878 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1879 For reuse, we replace received C->j with index that should be inserted to submat */ 1880 rbuf3_i[ct3++] = ct2; 1881 } 1882 } 1883 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1884 1885 } else { /* scall == MAT_REUSE_MATRIX */ 1886 submat = submats[0]; 1887 subc = (Mat_SeqAIJ*)submat->data; 1888 1889 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1890 for (l=0; l<nnz; l++) { 1891 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1892 subvals[idex++] = rbuf4_i[ct2]; 1893 } 1894 1895 bj = subc->j + subc->i[row]; 1896 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1897 } 1898 } else { /* allcolumns */ 1899 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1900 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1901 ct2 += nnz; 1902 } 1903 } 1904 } 1905 1906 /* sending a->a are done */ 1907 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1908 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1909 1910 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1911 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1912 submats[0] = submat; 1913 1914 /* 
Restore the indices */ 1915 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1916 if (!allcolumns) { 1917 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1918 } 1919 1920 /* Destroy allocated memory */ 1921 for (i=0; i<nrqs; ++i) { 1922 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1923 } 1924 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1925 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1926 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1927 1928 if (scall == MAT_INITIAL_MATRIX) { 1929 ierr = PetscFree(lens);CHKERRQ(ierr); 1930 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1931 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1932 } 1933 PetscFunctionReturn(0); 1934 } 1935 1936 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1937 { 1938 PetscErrorCode ierr; 1939 PetscInt ncol; 1940 PetscBool colflag,allcolumns=PETSC_FALSE; 1941 1942 PetscFunctionBegin; 1943 /* Allocate memory to hold all the submatrices */ 1944 if (scall == MAT_INITIAL_MATRIX) { 1945 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1946 } 1947 1948 /* Check for special case: each processor gets entire matrix columns */ 1949 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1950 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1951 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1952 1953 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1954 PetscFunctionReturn(0); 1955 } 1956 1957 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1958 { 1959 PetscErrorCode ierr; 1960 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2]; 1961 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1962 Mat_SeqAIJ *subc; 1963 Mat_SubSppt *smat; 1964 1965 PetscFunctionBegin; 1966 /* Check for special case: each processor has a single IS */ 1967 if 
(C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1968 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1969 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */ 1970 PetscFunctionReturn(0); 1971 } 1972 1973 /* Collect global wantallmatrix and nstages */ 1974 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1975 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1976 if (!nmax) nmax = 1; 1977 1978 if (scall == MAT_INITIAL_MATRIX) { 1979 /* Collect global wantallmatrix and nstages */ 1980 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1981 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1982 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1983 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1984 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1985 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1986 wantallmatrix = PETSC_TRUE; 1987 1988 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1989 } 1990 } 1991 1992 /* Determine the number of stages through which submatrices are done 1993 Each stage will extract nmax submatrices. 1994 nmax is determined by the matrix column dimension. 1995 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1996 */ 1997 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 1998 1999 in[0] = -1*(PetscInt)wantallmatrix; 2000 in[1] = nstages; 2001 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2002 wantallmatrix = (PetscBool)(-out[0]); 2003 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2004 2005 } else { /* MAT_REUSE_MATRIX */ 2006 if (ismax) { 2007 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2008 smat = subc->submatis1; 2009 } else { /* (*submat)[0] is a dummy matrix */ 2010 smat = (Mat_SubSppt*)(*submat)[0]->data; 2011 } 2012 if (!smat) { 2013 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2014 wantallmatrix = PETSC_TRUE; 2015 } else if (smat->singleis) { 2016 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2017 PetscFunctionReturn(0); 2018 } else { 2019 nstages = smat->nstages; 2020 } 2021 } 2022 2023 if (wantallmatrix) { 2024 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2025 PetscFunctionReturn(0); 2026 } 2027 2028 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2029 if (scall == MAT_INITIAL_MATRIX) { 2030 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2031 } 2032 2033 for (i=0,pos=0; i<nstages; i++) { 2034 if (pos+nmax <= ismax) max_no = nmax; 2035 else if (pos == ismax) max_no = 0; 2036 else max_no = ismax-pos; 2037 2038 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2039 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2040 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2041 smat->nstages = nstages; 2042 } 2043 pos += max_no; 2044 } 2045 2046 if (ismax && scall == MAT_INITIAL_MATRIX) { 2047 /* save nstages for reuse */ 2048 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2049 smat = subc->submatis1; 2050 smat->nstages = nstages; 2051 } 2052 PetscFunctionReturn(0); 2053 } 2054 2055 /* 
-------------------------------------------------------------------------*/
/*
   MatCreateSubMatrices_MPIAIJ_Local - worker that extracts ismax sequential
   submatrices from the parallel AIJ matrix C for one stage of
   MatCreateSubMatrices_MPIAIJ().

   Communication proceeds in four tagged phases:
     tag0: send requested off-process row indices (header: count + (is_no,len) pairs)
     tag2: receive, per requested row, the number of nonzeros it holds
     tag3: receive the global column indices of those rows
     tag4: receive the numerical values of those rows
   With MAT_INITIAL_MATRIX all buffers, row/column maps and request metadata
   are built and then cached in a Mat_SubSppt attached to each submatrix so a
   MAT_REUSE_MATRIX call can skip phases tag0-tag3 setup and only re-fetch
   values.  When ismax == 0 a dummy MATDUMMY matrix carries the cached state.

   NOTE(review): many buffers allocated here (sbuf1, sbuf2, rbuf1-3, pa, ...)
   are deliberately NOT freed in this routine — ownership passes to the
   Mat_SubSppt and they are presumably released by
   MatDestroy_SeqAIJ_Submatrices/MatDestroy_Dummy_Submatrices; confirm against
   those destructors (outside this chunk).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;                                     /* diagonal block */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;  /* b = off-diagonal block */
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;               /* hash-table maps for large N */
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;           /* dense array maps */
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  /* per-IS bookkeeping: owning proc of each row, column map, row map, allcolumns flag */
  ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
  ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
    if (!issorted[i]) iscsorted = issorted[i];  /* any unsorted column IS forces a final sort pass */

    ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);

    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);

    /* Check for special case: allcolumn */
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i]       = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      /* pull the communication state cached by the MAT_INITIAL_MATRIX call;
         the scalar fields (nrqs, nrqr, buffers...) are identical in every
         smat_i, so overwriting them each iteration is harmless */
      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
      if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;  /* dummy matrix stores the struct directly in ->data */

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */

    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
      row2proc[i] = row2proc_i;

      /* for a sorted IS the owner search can resume from the previous row's owner */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}  /* w1 = rows per proc, w3 = IS count per proc */
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j      = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz   += w1[j];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    tag0 = ((PetscObject)C)->tag;
    ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

    {
      /* carve per-destination send buffers out of the single tmp arena
         (first half of tmp; second half is reused below for rbuf2) */
      PetscInt *iptr = tmp;
      k = 0;
      for (i=0; i<nrqs; i++) {
        j        = pa[i];
        iptr    += k;
        sbuf1[j] = iptr;
        k        = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
      ptr[j]      = sbuf1[j] + 2*w3[j] + 1;  /* data region starts after header */
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      ierr       = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      irow_i     = irow[i];
      jmax       = nrow[i];
      for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS: header holds (is_no, count) pairs */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0];
          sbuf1_j[2*k]   = ctr_j;
          sbuf1_j[2*k-1] = i;
        }
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
    /* rbuf2 reuses the upper half of the tmp arena; replies mirror the request layout */
    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i     = rbuf1[i];
        start       = 2*rbuf1_i[0] + 1;  /* skip header to the row indices */
        ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
        ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
        sbuf2_i     = sbuf2[i];
        for (j=start; j<end; j++) {
          id           = rbuf1_i[j] - rstart;
          ncols        = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];  /* nnz of this local row, diag + offdiag */
          sbuf2_i[j]   = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = r_status1[i].MPI_SOURCE;
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);
    ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

    /* Receive messages*/
    ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* rbuf2[i][0] = total nnz coming from this proc; post recv for the column indices */
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }
    ierr = PetscFree(r_status2);CHKERRQ(ierr);
    ierr = PetscFree(r_waits2);CHKERRQ(ierr);

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

    if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
    ierr = PetscFree(s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_status2);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols, in ascending
               global order: off-diag cols below cstart, then diag cols, then
               off-diag cols at/after cend */
            cols = sbuf_aj_i + ct2;

            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);

    /* create col map: global col of C -> local col of submatrices (stored 1-based; 0 means "not selected") */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            cmap_i[icol_i[j]] = j+1;
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);

    if (ismax) {
      ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);  /* one contiguous slab, sliced per IS below */
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax       = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    for (i=0; i<ismax; i++) {
      ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;   /* NOTE: array variant stores 0-based, table variant stores 1-based */
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
            row--;
            if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    ierr = PetscFree(r_waits3);CHKERRQ(ierr);
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
    ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree(s_waits3);CHKERRQ(ierr);

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNew(&smat_i);CHKERRQ(ierr);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      /* chain the destroy op so the cached buffers are released with the submatrix */
      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = allcolumns[i];
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = row2proc[i];
      smat_i->rmap       = rmap[i];
      smat_i->cmap       = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = NULL;
      smat_i->rmap       = NULL;
      smat_i->cmap       = NULL;
    }

    if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in the same
             three-segment order the column indices were packed in above */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc       = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen  = subc->ilen;
    imat_j     = subc->j;
    imat_i     = subc->i;
    imat_a     = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
        /* write straight into the preallocated CSR arrays of the submatrix */
        mat_i    = imat_i[row];
        mat_a    = imat_a + mat_i;
        mat_j    = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;  /* cmap stores 1-based; convert to 0-based */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k];  /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no  = sbuf1_i[2*j-1];
      rmap_i = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue;  /* allcolumns rows were packed in ascending order already */
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);

  ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
 After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
 state, and needs to be "assembled" later by compressing B's column space.
2873 2874 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2875 Following this call, C->A & C->B have been created, even if empty. 2876 */ 2877 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2878 { 2879 /* If making this function public, change the error returned in this function away from _PLIB. */ 2880 PetscErrorCode ierr; 2881 Mat_MPIAIJ *aij; 2882 Mat_SeqAIJ *Baij; 2883 PetscBool seqaij,Bdisassembled; 2884 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2885 PetscScalar v; 2886 const PetscInt *rowindices,*colindices; 2887 2888 PetscFunctionBegin; 2889 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2890 if (A) { 2891 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2892 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2893 if (rowemb) { 2894 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2895 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2896 } else { 2897 if (C->rmap->n != A->rmap->n) { 2898 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2899 } 2900 } 2901 if (dcolemb) { 2902 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2903 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2904 } else { 2905 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2906 } 2907 } 2908 if (B) { 2909 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2910 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2911 if (rowemb) { 2912 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2913 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2914 } else { 2915 if (C->rmap->n != B->rmap->n) { 2916 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2917 } 2918 } 2919 if (ocolemb) { 2920 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2921 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2922 } else { 2923 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2924 } 2925 } 2926 2927 aij = (Mat_MPIAIJ*)C->data; 2928 if (!aij->A) { 2929 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2930 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2931 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2932 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2933 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2934 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2935 } 2936 if (A) { 2937 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2938 } else { 2939 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2940 } 2941 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2942 /* 2943 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2944 need to "disassemble" B -- convert it to using C's global indices. 2945 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2946 2947 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2948 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2949 2950 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2951 At least avoid calling MatSetValues() and the implied searches? 2952 */ 2953 2954 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2955 #if defined(PETSC_USE_CTABLE) 2956 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2957 #else 2958 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2959 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2960 if (aij->B) { 2961 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2962 } 2963 #endif 2964 ngcol = 0; 2965 if (aij->lvec) { 2966 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2967 } 2968 if (aij->garray) { 2969 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2970 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2971 } 2972 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2973 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2974 } 2975 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2976 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2977 } 2978 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2979 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2980 } 2981 } 2982 Bdisassembled = PETSC_FALSE; 2983 if (!aij->B) { 2984 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2985 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2986 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2987 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2988 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2989 Bdisassembled = PETSC_TRUE; 2990 } 2991 if (B) { 2992 Baij = (Mat_SeqAIJ*)B->data; 2993 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2994 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2995 for (i=0; i<B->rmap->n; i++) { 2996 nz[i] = Baij->i[i+1] - Baij->i[i]; 2997 } 2998 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
2999 ierr = PetscFree(nz);CHKERRQ(ierr); 3000 } 3001 3002 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3003 shift = rend-rstart; 3004 count = 0; 3005 rowindices = NULL; 3006 colindices = NULL; 3007 if (rowemb) { 3008 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3009 } 3010 if (ocolemb) { 3011 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3012 } 3013 for (i=0; i<B->rmap->n; i++) { 3014 PetscInt row; 3015 row = i; 3016 if (rowindices) row = rowindices[i]; 3017 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3018 col = Baij->j[count]; 3019 if (colindices) col = colindices[col]; 3020 if (Bdisassembled && col>=rstart) col += shift; 3021 v = Baij->a[count]; 3022 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3023 ++count; 3024 } 3025 } 3026 /* No assembly for aij->B is necessary. */ 3027 /* FIXME: set aij->B's nonzerostate correctly. */ 3028 } else { 3029 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3030 } 3031 C->preallocated = PETSC_TRUE; 3032 C->was_assembled = PETSC_FALSE; 3033 C->assembled = PETSC_FALSE; 3034 /* 3035 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3036 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3037 */ 3038 PetscFunctionReturn(0); 3039 } 3040 3041 /* 3042 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3043 */ 3044 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3045 { 3046 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3047 3048 PetscFunctionBegin; 3049 PetscValidPointer(A,2); 3050 PetscValidPointer(B,3); 3051 /* FIXME: make sure C is assembled */ 3052 *A = aij->A; 3053 *B = aij->B; 3054 /* Note that we don't incref *A and *B, so be careful! */ 3055 PetscFunctionReturn(0); 3056 } 3057 3058 /* 3059 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  /* Type-generic driver: the four callbacks supply the matrix-type-specific pieces.
     NOTE(review): getlocalmats is accepted but never called in this routine. */
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;    /* sequential diag (A) and off-diag (B) submatrices */
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count the locally-seen parallel IS pairs (comm size > 1) in cismax. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize; /* NOTE(review): shadows the function-scope isize declared above */
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }

  /*
    If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
    ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
        (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
            were sorted on each rank, concatenated they might no longer be sorted;
        (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
            indices in the nondecreasing order to the original index positions.
        If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
      Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    /* Adjacent-duplicate scan is sufficient because the IS was just sorted. */
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);


    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /* Alias (no incref) the row IS for the off-diag extraction; see the note above. */
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
      TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
      That way we can avoid sorting and computing permutations when reusing.
      To this end:
      - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
      - if caching arrays to hold the ISs, make and compose a container for them so that it can
        be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        AA = A[i];
        BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }

        /* AA (== A[i]) is consumed here; B[ii] is released below via MatDestroySubMatrices(). */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller */
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Thin wrapper: delegate to the type-generic driver with the (Seq/MPI)AIJ-specific callbacks. */
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}