1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 
PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++){ 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++){ 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank){ 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 
ierr = PetscFree2(indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows){ 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++){ 115 if (tosizes_temp[i]){ 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 for (i=0; i<nto; i++){ 123 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 124 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 125 } 126 /* send information to other processors */ 127 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 128 nrecvrows = 0; 129 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 130 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 131 nrecvrows = 0; 132 for (i=0; i<nfrom; i++){ 133 for (j=0; j<fromsizes[2*i]; j++){ 134 remote[nrecvrows].rank = fromranks[i]; 135 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 136 } 137 } 138 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 139 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 140 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 141 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 142 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 
143 /* message pair <no of is, row> */ 144 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 145 for (i=0; i<nrrows; i++){ 146 owner = rrow_ranks[i]; /* processor */ 147 j = tosizes_temp[owner]; /* index */ 148 todata[toffsets[j]++] = rrow_isids[i]; 149 todata[toffsets[j]++] = remoterows[i]; 150 } 151 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 152 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 153 ierr = PetscFree(toffsets);CHKERRQ(ierr); 154 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 155 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 157 /* send rows belonging to the remote so that then we could get the overlapping data back */ 158 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 159 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 160 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 161 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 162 ierr = PetscFree(fromranks);CHKERRQ(ierr); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 165 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 166 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 167 nrecvrows = 0; 168 for (i=0; i<nto; i++){ 169 for (j=0; j<tosizes[2*i]; j++){ 170 remote[nrecvrows].rank = toranks[i]; 171 remote[nrecvrows++].index = tosizes[2*i+1]+j; 172 } 173 } 174 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 175 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 176 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 177 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 178 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 179 /* overlap communication and computation */ 180 ierr = 
PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 181 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 182 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 183 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 184 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 185 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 186 ierr = PetscFree(toranks);CHKERRQ(ierr); 187 ierr = PetscFree(tosizes);CHKERRQ(ierr); 188 ierr = PetscFree(todata);CHKERRQ(ierr); 189 PetscFunctionReturn(0); 190 } 191 192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 193 { 194 PetscInt *isz,isz_i,i,j,is_id, data_size; 195 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 196 const PetscInt *indices_i_temp; 197 PetscErrorCode ierr; 198 199 PetscFunctionBegin; 200 max_lsize = 0; 201 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 202 for (i=0; i<nidx; i++){ 203 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 204 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 205 isz[i] = lsize; 206 } 207 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 208 for (i=0; i<nidx; i++){ 209 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 210 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 211 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 212 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 213 } 214 /* retrieve information to get row id and its overlap */ 215 for (i=0; i<nrecvs; ){ 216 is_id = recvdata[i++]; 217 data_size = recvdata[i++]; 218 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 219 isz_i = isz[is_id]; 220 for (j=0; j< data_size; j++){ 221 col = recvdata[i++]; 222 indices_i[isz_i++] = col; 223 } 224 isz[is_id] = isz_i; 225 } 226 /* remove duplicate entities */ 227 for (i=0; i<nidx; i++){ 228 indices_i = indices_temp+(max_lsize+nrecvs)*i; 229 isz_i = isz[i]; 230 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 231 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 232 } 233 ierr = PetscFree(isz);CHKERRQ(ierr); 234 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 239 { 240 PetscLayout rmap,cmap; 241 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 242 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 243 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 244 const PetscInt *gcols,*ai,*aj,*bi,*bj; 245 Mat amat,bmat; 246 PetscMPIInt rank; 247 PetscBool done; 248 MPI_Comm comm; 249 PetscErrorCode ierr; 250 251 PetscFunctionBegin; 252 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 253 ierr = 
MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 254 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 255 /* Even if the mat is symmetric, we still assume it is not symmetric */ 256 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 257 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 258 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 259 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 260 /* total number of nonzero values is used to estimate the memory usage in the next step */ 261 tnz = ai[an]+bi[bn]; 262 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 263 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 264 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 265 /* to find the longest message */ 266 max_fszs = 0; 267 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 268 /* better way to estimate number of nonzero in the mat??? 
*/ 269 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 270 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 271 rows_pos = 0; 272 totalrows = 0; 273 for (i=0; i<nfrom; i++){ 274 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 275 /* group data together */ 276 for (j=0; j<fromsizes[2*i]; j+=2){ 277 is_id = fromrows[rows_pos++];/* no of is */ 278 rows_i = rows_data[is_id]; 279 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 280 } 281 /* estimate a space to avoid multiple allocations */ 282 for (j=0; j<nidx; j++){ 283 indvc_ij = 0; 284 rows_i = rows_data[j]; 285 for (l=0; l<rows_pos_i[j]; l++){ 286 row = rows_i[l]-rstart; 287 start = ai[row]; 288 end = ai[row+1]; 289 for (k=start; k<end; k++){ /* Amat */ 290 col = aj[k] + cstart; 291 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 292 } 293 start = bi[row]; 294 end = bi[row+1]; 295 for (k=start; k<end; k++) { /* Bmat */ 296 col = gcols[bj[k]]; 297 indices_tmp[indvc_ij++] = col; 298 } 299 } 300 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 301 indv_counts[i*nidx+j] = indvc_ij; 302 totalrows += indvc_ij; 303 } 304 } 305 /* message triple <no of is, number of rows, rows> */ 306 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 307 totalrows = 0; 308 rows_pos = 0; 309 /* use this code again */ 310 for (i=0;i<nfrom;i++){ 311 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 312 for (j=0; j<fromsizes[2*i]; j+=2){ 313 is_id = fromrows[rows_pos++]; 314 rows_i = rows_data[is_id]; 315 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 316 } 317 /* add data */ 318 for (j=0; j<nidx; j++){ 319 if (!indv_counts[i*nidx+j]) continue; 320 indvc_ij = 0; 321 sbdata[totalrows++] = j; 322 sbdata[totalrows++] = indv_counts[i*nidx+j]; 323 sbsizes[2*i] += 2; 324 rows_i = rows_data[j]; 325 for 
(l=0; l<rows_pos_i[j]; l++){ 326 row = rows_i[l]-rstart; 327 start = ai[row]; 328 end = ai[row+1]; 329 for (k=start; k<end; k++){ /* Amat */ 330 col = aj[k] + cstart; 331 indices_tmp[indvc_ij++] = col; 332 } 333 start = bi[row]; 334 end = bi[row+1]; 335 for (k=start; k<end; k++) { /* Bmat */ 336 col = gcols[bj[k]]; 337 indices_tmp[indvc_ij++] = col; 338 } 339 } 340 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 341 sbsizes[2*i] += indvc_ij; 342 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 343 totalrows += indvc_ij; 344 } 345 } 346 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 347 for (i=0; i<nfrom; i++){ 348 offsets[i+1] = offsets[i] + sbsizes[2*i]; 349 sbsizes[2*i+1] = offsets[i]; 350 } 351 ierr = PetscFree(offsets);CHKERRQ(ierr); 352 if (sbrowsizes) *sbrowsizes = sbsizes; 353 if (sbrows) *sbrows = sbdata; 354 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 355 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 356 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 357 PetscFunctionReturn(0); 358 } 359 360 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 361 { 362 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 363 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 364 PetscInt lsize,lsize_tmp,owner; 365 PetscMPIInt rank; 366 Mat amat,bmat; 367 PetscBool done; 368 PetscLayout cmap,rmap; 369 MPI_Comm comm; 370 PetscErrorCode ierr; 371 372 PetscFunctionBegin; 373 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 374 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 375 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 376 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 377 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row 
IJ \n"); 378 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 379 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 380 /* is it a safe way to compute number of nonzero values ? */ 381 tnz = ai[an]+bi[bn]; 382 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 383 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 384 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 385 /* it is a better way to estimate memory than the old implementation 386 * where global size of matrix is used 387 * */ 388 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 389 for (i=0; i<nidx; i++) { 390 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 391 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 392 lsize_tmp = 0; 393 for (j=0; j<lsize; j++) { 394 owner = -1; 395 row = indices[j]; 396 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 397 if (owner != rank) continue; 398 /* local number */ 399 row -= rstart; 400 start = ai[row]; 401 end = ai[row+1]; 402 for (k=start; k<end; k++) { /* Amat */ 403 col = aj[k] + cstart; 404 indices_temp[lsize_tmp++] = col; 405 } 406 start = bi[row]; 407 end = bi[row+1]; 408 for (k=start; k<end; k++) { /* Bmat */ 409 col = gcols[bj[k]]; 410 indices_temp[lsize_tmp++] = col; 411 } 412 } 413 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 414 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 415 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 416 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 417 } 418 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 419 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 420 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 421 PetscFunctionReturn(0); 422 } 423 424 425 /* 426 Sample message format: 427 If a processor A wants processor B to process some elements 
corresponding 428 to index sets is[1],is[5] 429 mesg [0] = 2 (no of index sets in the mesg) 430 ----------- 431 mesg [1] = 1 => is[1] 432 mesg [2] = sizeof(is[1]); 433 ----------- 434 mesg [3] = 5 => is[5] 435 mesg [4] = sizeof(is[5]); 436 ----------- 437 mesg [5] 438 mesg [n] datas[1] 439 ----------- 440 mesg[n+1] 441 mesg[m] data(is[5]) 442 ----------- 443 444 Notes: 445 nrqs - no of requests sent (or to be sent out) 446 nrqr - no of requests recieved (which have to be or which have been processed 447 */ 448 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 449 { 450 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 451 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 452 const PetscInt **idx,*idx_i; 453 PetscInt *n,**data,len; 454 #if defined(PETSC_USE_CTABLE) 455 PetscTable *table_data,table_data_i; 456 PetscInt *tdata,tcount,tcount_max; 457 #else 458 PetscInt *data_i,*d_p; 459 #endif 460 PetscErrorCode ierr; 461 PetscMPIInt size,rank,tag1,tag2; 462 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 463 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 464 PetscBT *table; 465 MPI_Comm comm; 466 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 467 MPI_Status *s_status,*recv_status; 468 char *t_p; 469 470 PetscFunctionBegin; 471 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 472 size = c->size; 473 rank = c->rank; 474 M = C->rmap->N; 475 476 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 477 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 478 479 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 480 481 for (i=0; i<imax; i++) { 482 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 483 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 484 } 485 486 /* evaluate communication - mesg to who,length of mesg, and buffer space 487 required. 
Based on this, buffers are allocated, and data copied into them */ 488 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 489 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 490 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 491 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 492 for (i=0; i<imax; i++) { 493 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 494 idx_i = idx[i]; 495 len = n[i]; 496 for (j=0; j<len; j++) { 497 row = idx_i[j]; 498 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 499 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 500 w4[proc]++; 501 } 502 for (j=0; j<size; j++) { 503 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 504 } 505 } 506 507 nrqs = 0; /* no of outgoing messages */ 508 msz = 0; /* total mesg length (for all proc */ 509 w1[rank] = 0; /* no mesg sent to intself */ 510 w3[rank] = 0; 511 for (i=0; i<size; i++) { 512 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 513 } 514 /* pa - is list of processors to communicate with */ 515 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 516 for (i=0,j=0; i<size; i++) { 517 if (w1[i]) {pa[j] = i; j++;} 518 } 519 520 /* Each message would have a header = 1 + 2*(no of IS) + data */ 521 for (i=0; i<nrqs; i++) { 522 j = pa[i]; 523 w1[j] += w2[j] + 2*w3[j]; 524 msz += w1[j]; 525 } 526 527 /* Determine the number of messages to expect, their lengths, from from-ids */ 528 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 529 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 530 531 /* Now post the Irecvs corresponding to these messages */ 532 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 533 534 /* Allocate Memory for outgoing 
messages */ 535 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 536 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 537 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 538 539 { 540 PetscInt *iptr = tmp,ict = 0; 541 for (i=0; i<nrqs; i++) { 542 j = pa[i]; 543 iptr += ict; 544 outdat[j] = iptr; 545 ict = w1[j]; 546 } 547 } 548 549 /* Form the outgoing messages */ 550 /* plug in the headers */ 551 for (i=0; i<nrqs; i++) { 552 j = pa[i]; 553 outdat[j][0] = 0; 554 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 555 ptr[j] = outdat[j] + 2*w3[j] + 1; 556 } 557 558 /* Memory for doing local proc's work */ 559 { 560 PetscInt M_BPB_imax = 0; 561 #if defined(PETSC_USE_CTABLE) 562 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 563 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 564 for (i=0; i<imax; i++) { 565 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 566 } 567 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 568 for (i=0; i<imax; i++) { 569 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 570 } 571 #else 572 PetscInt Mimax = 0; 573 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 574 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 575 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 576 for (i=0; i<imax; i++) { 577 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 578 data[i] = d_p + M*i; 579 } 580 #endif 581 } 582 583 /* Parse the IS and update local tables and the outgoing buf with the data */ 584 { 585 PetscInt n_i,isz_i,*outdat_j,ctr_j; 586 PetscBT table_i; 587 588 for (i=0; i<imax; i++) { 589 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 590 n_i = n[i]; 591 table_i = table[i]; 592 idx_i = idx[i]; 593 #if defined(PETSC_USE_CTABLE) 594 table_data_i = table_data[i]; 595 #else 596 
data_i = data[i]; 597 #endif 598 isz_i = isz[i]; 599 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 600 row = idx_i[j]; 601 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 602 if (proc != rank) { /* copy to the outgoing buffer */ 603 ctr[proc]++; 604 *ptr[proc] = row; 605 ptr[proc]++; 606 } else if (!PetscBTLookupSet(table_i,row)) { 607 #if defined(PETSC_USE_CTABLE) 608 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 609 #else 610 data_i[isz_i] = row; /* Update the local table */ 611 #endif 612 isz_i++; 613 } 614 } 615 /* Update the headers for the current IS */ 616 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 617 if ((ctr_j = ctr[j])) { 618 outdat_j = outdat[j]; 619 k = ++outdat_j[0]; 620 outdat_j[2*k] = ctr_j; 621 outdat_j[2*k-1] = i; 622 } 623 } 624 isz[i] = isz_i; 625 } 626 } 627 628 /* Now post the sends */ 629 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 630 for (i=0; i<nrqs; ++i) { 631 j = pa[i]; 632 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 633 } 634 635 /* No longer need the original indices */ 636 for (i=0; i<imax; ++i) { 637 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 638 } 639 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 640 641 for (i=0; i<imax; ++i) { 642 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 643 } 644 645 /* Do Local work */ 646 #if defined(PETSC_USE_CTABLE) 647 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 648 #else 649 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 650 #endif 651 652 /* Receive messages */ 653 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 654 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 655 656 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 657 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 658 659 /* Phase 1 sends are complete - deallocate buffers */ 660 ierr = 
PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 661 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 662 663 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 664 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 665 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 666 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 667 ierr = PetscFree(rbuf);CHKERRQ(ierr); 668 669 670 /* Send the data back */ 671 /* Do a global reduction to know the buffer space req for incoming messages */ 672 { 673 PetscMPIInt *rw1; 674 675 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 676 677 for (i=0; i<nrqr; ++i) { 678 proc = recv_status[i].MPI_SOURCE; 679 680 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 681 rw1[proc] = isz1[i]; 682 } 683 ierr = PetscFree(onodes1);CHKERRQ(ierr); 684 ierr = PetscFree(olengths1);CHKERRQ(ierr); 685 686 /* Determine the number of messages to expect, their lengths, from from-ids */ 687 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 688 ierr = PetscFree(rw1);CHKERRQ(ierr); 689 } 690 /* Now post the Irecvs corresponding to these messages */ 691 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 692 693 /* Now post the sends */ 694 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 695 for (i=0; i<nrqr; ++i) { 696 j = recv_status[i].MPI_SOURCE; 697 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 698 } 699 700 /* receive work done on other processors */ 701 { 702 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 703 PetscMPIInt idex; 704 PetscBT table_i; 705 MPI_Status *status2; 706 707 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 708 for (i=0; i<nrqs; ++i) { 709 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 710 /* Process the message */ 711 rbuf2_i = rbuf2[idex]; 712 ct1 = 2*rbuf2_i[0]+1; 713 jmax = rbuf2[idex][0]; 714 for (j=1; j<=jmax; j++) { 715 max = 
rbuf2_i[2*j]; 716 is_no = rbuf2_i[2*j-1]; 717 isz_i = isz[is_no]; 718 table_i = table[is_no]; 719 #if defined(PETSC_USE_CTABLE) 720 table_data_i = table_data[is_no]; 721 #else 722 data_i = data[is_no]; 723 #endif 724 for (k=0; k<max; k++,ct1++) { 725 row = rbuf2_i[ct1]; 726 if (!PetscBTLookupSet(table_i,row)) { 727 #if defined(PETSC_USE_CTABLE) 728 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 729 #else 730 data_i[isz_i] = row; 731 #endif 732 isz_i++; 733 } 734 } 735 isz[is_no] = isz_i; 736 } 737 } 738 739 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 740 ierr = PetscFree(status2);CHKERRQ(ierr); 741 } 742 743 #if defined(PETSC_USE_CTABLE) 744 tcount_max = 0; 745 for (i=0; i<imax; ++i) { 746 table_data_i = table_data[i]; 747 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 748 if (tcount_max < tcount) tcount_max = tcount; 749 } 750 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 751 #endif 752 753 for (i=0; i<imax; ++i) { 754 #if defined(PETSC_USE_CTABLE) 755 PetscTablePosition tpos; 756 table_data_i = table_data[i]; 757 758 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 759 while (tpos) { 760 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 761 tdata[--j] = --k; 762 } 763 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 764 #else 765 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 766 #endif 767 } 768 769 ierr = PetscFree(onodes2);CHKERRQ(ierr); 770 ierr = PetscFree(olengths2);CHKERRQ(ierr); 771 772 ierr = PetscFree(pa);CHKERRQ(ierr); 773 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 774 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 775 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 776 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 777 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 778 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 779 ierr = PetscFree(s_status);CHKERRQ(ierr); 780 ierr = 
PetscFree(recv_status);CHKERRQ(ierr); 781 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 782 ierr = PetscFree(xdata);CHKERRQ(ierr); 783 ierr = PetscFree(isz1);CHKERRQ(ierr); 784 #if defined(PETSC_USE_CTABLE) 785 for (i=0; i<imax; i++) { 786 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 787 } 788 ierr = PetscFree(table_data);CHKERRQ(ierr); 789 ierr = PetscFree(tdata);CHKERRQ(ierr); 790 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 791 #else 792 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 793 #endif 794 PetscFunctionReturn(0); 795 } 796 797 /* 798 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 799 the work on the local processor. 800 801 Inputs: 802 C - MAT_MPIAIJ; 803 imax - total no of index sets processed at a time; 804 table - an array of char - size = m bits. 805 806 Output: 807 isz - array containing the count of the solution elements corresponding 808 to each index set; 809 data or table_data - pointer to the solutions 810 */ 811 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 812 { 813 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 814 Mat A = c->A,B = c->B; 815 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 816 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 817 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 818 PetscBT table_i; 819 #if defined(PETSC_USE_CTABLE) 820 PetscTable table_data_i; 821 PetscErrorCode ierr; 822 PetscTablePosition tpos; 823 PetscInt tcount,*tdata; 824 #else 825 PetscInt *data_i; 826 #endif 827 828 PetscFunctionBegin; 829 rstart = C->rmap->rstart; 830 cstart = C->cmap->rstart; 831 ai = a->i; 832 aj = a->j; 833 bi = b->i; 834 bj = b->j; 835 garray = c->garray; 836 837 for (i=0; i<imax; i++) { 838 #if defined(PETSC_USE_CTABLE) 839 /* copy existing entries of table_data_i into tdata[] */ 840 table_data_i = table_data[i]; 841 ierr = 
PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 842 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 843 844 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 845 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 846 while (tpos) { 847 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 848 tdata[--j] = --row; 849 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 850 } 851 #else 852 data_i = data[i]; 853 #endif 854 table_i = table[i]; 855 isz_i = isz[i]; 856 max = isz[i]; 857 858 for (j=0; j<max; j++) { 859 #if defined(PETSC_USE_CTABLE) 860 row = tdata[j] - rstart; 861 #else 862 row = data_i[j] - rstart; 863 #endif 864 start = ai[row]; 865 end = ai[row+1]; 866 for (k=start; k<end; k++) { /* Amat */ 867 val = aj[k] + cstart; 868 if (!PetscBTLookupSet(table_i,val)) { 869 #if defined(PETSC_USE_CTABLE) 870 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 871 #else 872 data_i[isz_i] = val; 873 #endif 874 isz_i++; 875 } 876 } 877 start = bi[row]; 878 end = bi[row+1]; 879 for (k=start; k<end; k++) { /* Bmat */ 880 val = garray[bj[k]]; 881 if (!PetscBTLookupSet(table_i,val)) { 882 #if defined(PETSC_USE_CTABLE) 883 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 884 #else 885 data_i[isz_i] = val; 886 #endif 887 isz_i++; 888 } 889 } 890 } 891 isz[i] = isz_i; 892 893 #if defined(PETSC_USE_CTABLE) 894 ierr = PetscFree(tdata);CHKERRQ(ierr); 895 #endif 896 } 897 PetscFunctionReturn(0); 898 } 899 900 /* 901 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 902 and return the output 903 904 Input: 905 C - the matrix 906 nrqr - no of messages being processed. 
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather then all previous rows as it is now where a single large chunck of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz = total number of requested row indices; ct = total number of ISs */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic estimate of the reply size: average nonzeros per local row times rows requested */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* all xdata[i] alias into this single growable buffer anchored at xdata[0] */
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and rebase every xdata[ctr]
               pointer, since they all alias into xdata[0]'s allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each accepted row by its Amat and Bmat column indices */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat: shift to global column numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat: map through garray to global columns */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /*
----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* each rank fills only its own slice of the global lens[] array;
       lens[i] = nonzeros of global row i (diagonal plus off-diagonal block) */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); /* output is an array holding the single matrix */
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into my slice of B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge the three column ranges of each local row in ascending global order:
       below-diagonal Bmat entries, Amat entries, above-diagonal Bmat entries */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper merge order as the column-index pass above,
       so values line up with the already-gathered b->j */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* MAT_REUSE_MATRIX path did not allocate these above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1253 PetscMPIInt *onodes1,*olengths1,idex,end; 1254 Mat_SubSppt *smatis1; 1255 PetscBool isrowsorted; 1256 1257 PetscFunctionBegin; 1258 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1259 1260 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1261 size = c->size; 1262 rank = c->rank; 1263 1264 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1265 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1266 1267 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1268 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1269 if (allcolumns) { 1270 icol = NULL; 1271 ncol = C->cmap->N; 1272 } else { 1273 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1274 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1275 } 1276 1277 if (scall == MAT_INITIAL_MATRIX) { 1278 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1279 1280 //printf("[%d] ncol %d, Cn %d, CN %d\n",rank,ncol,C->cmap->n,C->cmap->N); 1281 1282 /* Get some new tags to keep the communication clean */ 1283 tag1 = ((PetscObject)C)->tag; 1284 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1285 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1286 1287 /* evaluate communication - mesg to who, length of mesg, and buffer space 1288 required. Based on this, buffers are allocated, and data copied into them */ 1289 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1290 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1291 1292 /* w1[proc] = num of rows owned by proc -- to be requested */ 1293 proc = 0; 1294 nrqs = 0; /* num of outgoing messages */ 1295 for (j=0; j<nrow; j++) { 1296 row = irow[j]; /* sorted! 
*/ 1297 while (row >= C->rmap->range[proc+1]) proc++; 1298 w1[proc]++; 1299 row2proc[j] = proc; /* map row index to proc */ 1300 1301 if (proc != rank && !w2[proc]) { 1302 w2[proc] = 1; nrqs++; 1303 } 1304 } 1305 w1[rank] = 0; /* rows owned by self will not be requested */ 1306 1307 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1308 for (proc=0,j=0; proc<size; proc++) { 1309 if (w1[proc]) { pa[j++] = proc;} 1310 } 1311 1312 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1313 msz = 0; /* total mesg length (for all procs) */ 1314 for (i=0; i<nrqs; i++) { 1315 proc = pa[i]; 1316 w1[proc] += 3; 1317 msz += w1[proc]; 1318 } 1319 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1320 1321 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1322 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1323 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1324 1325 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1326 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1327 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1328 1329 /* Now post the Irecvs corresponding to these messages */ 1330 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1331 1332 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1333 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1334 1335 /* Allocate Memory for outgoing messages */ 1336 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1337 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1338 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1339 1340 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1341 iptr = tmp; 1342 for (i=0; i<nrqs; i++) { 1343 proc = pa[i]; 1344 sbuf1[proc] = iptr; 
1345 iptr += w1[proc]; 1346 } 1347 1348 /* Form the outgoing messages */ 1349 /* Initialize the header space */ 1350 for (i=0; i<nrqs; i++) { 1351 proc = pa[i]; 1352 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1353 ptr[proc] = sbuf1[proc] + 3; 1354 } 1355 1356 /* Parse the isrow and copy data into outbuf */ 1357 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1358 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1359 proc = row2proc[j]; 1360 if (proc != rank) { /* copy to the outgoing buf*/ 1361 *ptr[proc] = irow[j]; 1362 ctr[proc]++; ptr[proc]++; 1363 } 1364 } 1365 1366 /* Update the headers for the current IS */ 1367 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1368 if ((ctr_j = ctr[j])) { 1369 sbuf1_j = sbuf1[j]; 1370 k = ++sbuf1_j[0]; 1371 sbuf1_j[2*k] = ctr_j; 1372 sbuf1_j[2*k-1] = 0; 1373 } 1374 } 1375 1376 /* Now post the sends */ 1377 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1378 for (i=0; i<nrqs; ++i) { 1379 proc = pa[i]; 1380 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1381 } 1382 1383 /* Post Receives to capture the buffer size */ 1384 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1385 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1386 1387 rbuf2[0] = tmp + msz; 1388 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1389 1390 for (i=0; i<nrqs; ++i) { 1391 proc = pa[i]; 1392 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1393 } 1394 1395 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1396 1397 /* Send to other procs the buf size they should allocate */ 1398 /* Receive messages*/ 1399 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1400 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1401 1402 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1403 for 
(i=0; i<nrqr; ++i) { 1404 req_size[i] = 0; 1405 rbuf1_i = rbuf1[i]; 1406 start = 2*rbuf1_i[0] + 1; 1407 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1408 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1409 sbuf2_i = sbuf2[i]; 1410 for (j=start; j<end; j++) { 1411 k = rbuf1_i[j] - rstart; 1412 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1413 sbuf2_i[j] = ncols; 1414 req_size[i] += ncols; 1415 } 1416 req_source1[i] = r_status1[i].MPI_SOURCE; 1417 1418 /* form the header */ 1419 sbuf2_i[0] = req_size[i]; 1420 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1421 1422 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1423 } 1424 1425 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1426 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1427 1428 /* rbuf2 is received, Post recv column indices a->j */ 1429 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1430 1431 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1432 for (i=0; i<nrqs; ++i) { 1433 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1434 req_source2[i] = r_status2[i].MPI_SOURCE; 1435 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1436 } 1437 1438 /* Wait on sends1 and sends2 */ 1439 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1440 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1441 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1442 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1443 1444 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1445 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1446 1447 /* Now allocate sending buffers for a->j, and send them off */ 1448 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1449 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1450 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1451 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + 
req_size[i-1]; 1452 1453 for (i=0; i<nrqr; i++) { /* for each requested message */ 1454 rbuf1_i = rbuf1[i]; 1455 sbuf_aj_i = sbuf_aj[i]; 1456 ct1 = 2*rbuf1_i[0] + 1; 1457 ct2 = 0; 1458 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1459 1460 kmax = rbuf1[i][2]; 1461 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1462 row = rbuf1_i[ct1] - rstart; 1463 nzA = ai[row+1] - ai[row]; 1464 nzB = bi[row+1] - bi[row]; 1465 ncols = nzA + nzB; 1466 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1467 1468 /* load the column indices for this row into cols*/ 1469 cols = sbuf_aj_i + ct2; 1470 1471 lwrite = 0; 1472 for (l=0; l<nzB; l++) { 1473 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1474 } 1475 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1476 for (l=0; l<nzB; l++) { 1477 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1478 } 1479 1480 ct2 += ncols; 1481 } 1482 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1483 } 1484 1485 /* create column map (cmap): global col of C -> local col of submat */ 1486 #if defined(PETSC_USE_CTABLE) 1487 if (!allcolumns) { 1488 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1489 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1490 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1491 if (icol[j] >= cstart && icol[j] <cend) { 1492 cmap_loc[icol[j] - cstart] = j+1; 1493 } else { /* use PetscTable for non-local col indices */ 1494 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1495 } 1496 } 1497 } else { 1498 cmap = NULL; 1499 cmap_loc = NULL; 1500 } 1501 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1502 #else 1503 if (!allcolumns) { 1504 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1505 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1506 } else { 1507 cmap = NULL; 1508 } 1509 #endif 1510 1511 /* Create lens for 
MatSeqAIJSetPreallocation() */ 1512 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1513 1514 /* Compute lens from local part of C */ 1515 for (j=0; j<nrow; j++) { 1516 row = irow[j]; 1517 proc = row2proc[j]; 1518 if (proc == rank) { 1519 /* diagonal part A = c->A */ 1520 ncols = ai[row-rstart+1] - ai[row-rstart]; 1521 cols = aj + ai[row-rstart]; 1522 if (!allcolumns) { 1523 for (k=0; k<ncols; k++) { 1524 #if defined(PETSC_USE_CTABLE) 1525 tcol = cmap_loc[cols[k]]; 1526 #else 1527 tcol = cmap[cols[k]+cstart]; 1528 #endif 1529 if (tcol) lens[j]++; 1530 } 1531 } else { /* allcolumns */ 1532 lens[j] = ncols; 1533 } 1534 1535 /* off-diagonal part B = c->B */ 1536 ncols = bi[row-rstart+1] - bi[row-rstart]; 1537 cols = bj + bi[row-rstart]; 1538 if (!allcolumns) { 1539 for (k=0; k<ncols; k++) { 1540 #if defined(PETSC_USE_CTABLE) 1541 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1542 #else 1543 tcol = cmap[bmap[cols[k]]]; 1544 #endif 1545 if (tcol) lens[j]++; 1546 } 1547 } else { /* allcolumns */ 1548 lens[j] += ncols; 1549 } 1550 } 1551 } 1552 1553 /* Create row map (rmap): global row of C -> local row of submat */ 1554 #if defined(PETSC_USE_CTABLE) 1555 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1556 for (j=0; j<nrow; j++) { 1557 row = irow[j]; 1558 proc = row2proc[j]; 1559 if (proc == rank) { /* a local row */ 1560 rmap_loc[row - rstart] = j; 1561 } else { 1562 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1563 } 1564 } 1565 #else 1566 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1567 for (j=0; j<nrow; j++) { 1568 rmap[irow[j]] = j; 1569 } 1570 #endif 1571 1572 /* Update lens from offproc data */ 1573 /* recv a->j is done */ 1574 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1575 for (i=0; i<nrqs; i++) { 1576 proc = pa[i]; 1577 sbuf1_i = sbuf1[proc]; 1578 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1579 ct1 = 2 + 1; 1580 ct2 = 0; 1581 
rbuf2_i = rbuf2[i]; /* received length of C->j */ 1582 rbuf3_i = rbuf3[i]; /* received C->j */ 1583 1584 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1585 max1 = sbuf1_i[2]; 1586 for (k=0; k<max1; k++,ct1++) { 1587 #if defined(PETSC_USE_CTABLE) 1588 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1589 row--; 1590 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1591 #else 1592 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1593 #endif 1594 /* Now, store row index of submat in sbuf1_i[ct1] */ 1595 sbuf1_i[ct1] = row; 1596 1597 nnz = rbuf2_i[ct1]; 1598 if (!allcolumns) { 1599 for (l=0; l<nnz; l++,ct2++) { 1600 #if defined(PETSC_USE_CTABLE) 1601 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1602 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1603 } else { 1604 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1605 } 1606 #else 1607 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1608 #endif 1609 if (tcol) lens[row]++; 1610 } 1611 } else { /* allcolumns */ 1612 lens[row] += nnz; 1613 } 1614 } 1615 } 1616 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1617 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1618 1619 /* Create the submatrices */ 1620 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1621 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1622 1623 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1624 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1625 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1626 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1627 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1628 1629 /* create struct Mat_SubSppt and attached it to submat */ 1630 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1631 subc = (Mat_SeqAIJ*)submat->data; 1632 subc->submatis1 = smatis1; 1633 1634 smatis1->id = 0; 
1635 smatis1->nrqs = nrqs; 1636 smatis1->nrqr = nrqr; 1637 smatis1->rbuf1 = rbuf1; 1638 smatis1->rbuf2 = rbuf2; 1639 smatis1->rbuf3 = rbuf3; 1640 smatis1->sbuf2 = sbuf2; 1641 smatis1->req_source2 = req_source2; 1642 1643 smatis1->sbuf1 = sbuf1; 1644 smatis1->ptr = ptr; 1645 smatis1->tmp = tmp; 1646 smatis1->ctr = ctr; 1647 1648 smatis1->pa = pa; 1649 smatis1->req_size = req_size; 1650 smatis1->req_source1 = req_source1; 1651 1652 smatis1->allcolumns = allcolumns; 1653 smatis1->singleis = PETSC_TRUE; 1654 smatis1->row2proc = row2proc; 1655 smatis1->rmap = rmap; 1656 smatis1->cmap = cmap; 1657 #if defined(PETSC_USE_CTABLE) 1658 smatis1->rmap_loc = rmap_loc; 1659 smatis1->cmap_loc = cmap_loc; 1660 #endif 1661 1662 smatis1->destroy = submat->ops->destroy; 1663 submat->ops->destroy = MatDestroy_SeqAIJ_Submatrices; 1664 submat->factortype = C->factortype; 1665 1666 /* compute rmax */ 1667 rmax = 0; 1668 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1669 1670 } else { /* scall == MAT_REUSE_MATRIX */ 1671 submat = submats[0]; 1672 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1673 1674 subc = (Mat_SeqAIJ*)submat->data; 1675 rmax = subc->rmax; 1676 smatis1 = subc->submatis1; 1677 nrqs = smatis1->nrqs; 1678 nrqr = smatis1->nrqr; 1679 rbuf1 = smatis1->rbuf1; 1680 rbuf2 = smatis1->rbuf2; 1681 rbuf3 = smatis1->rbuf3; 1682 req_source2 = smatis1->req_source2; 1683 1684 sbuf1 = smatis1->sbuf1; 1685 sbuf2 = smatis1->sbuf2; 1686 ptr = smatis1->ptr; 1687 tmp = smatis1->tmp; 1688 ctr = smatis1->ctr; 1689 1690 pa = smatis1->pa; 1691 req_size = smatis1->req_size; 1692 req_source1 = smatis1->req_source1; 1693 1694 allcolumns = smatis1->allcolumns; 1695 row2proc = smatis1->row2proc; 1696 rmap = smatis1->rmap; 1697 cmap = smatis1->cmap; 1698 #if defined(PETSC_USE_CTABLE) 1699 rmap_loc = smatis1->rmap_loc; 1700 cmap_loc = smatis1->cmap_loc; 1701 #endif 1702 } 1703 1704 /* Post recv matrix values */ 1705 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1706 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1707 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1708 for (i=0; i<nrqs; ++i) { 1709 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1710 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1711 } 1712 1713 /* Allocate sending buffers for a->a, and send them off */ 1714 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1715 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1716 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1717 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1718 1719 for (i=0; i<nrqr; i++) { 1720 rbuf1_i = rbuf1[i]; 1721 sbuf_aa_i = sbuf_aa[i]; 1722 ct1 = 2*rbuf1_i[0]+1; 1723 ct2 = 0; 1724 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1725 1726 kmax = rbuf1_i[2]; 1727 for (k=0; k<kmax; k++,ct1++) { 1728 row = rbuf1_i[ct1] - rstart; 1729 nzA = ai[row+1] - ai[row]; 1730 nzB = bi[row+1] - 
bi[row]; 1731 ncols = nzA + nzB; 1732 cworkB = bj + bi[row]; 1733 vworkA = a_a + ai[row]; 1734 vworkB = b_a + bi[row]; 1735 1736 /* load the column values for this row into vals*/ 1737 vals = sbuf_aa_i + ct2; 1738 1739 lwrite = 0; 1740 for (l=0; l<nzB; l++) { 1741 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1742 } 1743 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1744 for (l=0; l<nzB; l++) { 1745 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1746 } 1747 1748 ct2 += ncols; 1749 } 1750 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1751 } 1752 1753 /* Assemble submat */ 1754 /* First assemble the local rows */ 1755 for (j=0; j<nrow; j++) { 1756 row = irow[j]; 1757 proc = row2proc[j]; 1758 if (proc == rank) { 1759 Crow = row - rstart; /* local row index of C */ 1760 #if defined(PETSC_USE_CTABLE) 1761 row = rmap_loc[Crow]; /* row index of submat */ 1762 #else 1763 row = rmap[row]; 1764 #endif 1765 1766 if (allcolumns) { 1767 /* diagonal part A = c->A */ 1768 ncols = ai[Crow+1] - ai[Crow]; 1769 cols = aj + ai[Crow]; 1770 vals = a->a + ai[Crow]; 1771 i = 0; 1772 for (k=0; k<ncols; k++) { 1773 subcols[i] = cols[k] + cstart; 1774 subvals[i++] = vals[k]; 1775 } 1776 1777 /* off-diagonal part B = c->B */ 1778 ncols = bi[Crow+1] - bi[Crow]; 1779 cols = bj + bi[Crow]; 1780 vals = b->a + bi[Crow]; 1781 for (k=0; k<ncols; k++) { 1782 subcols[i] = bmap[cols[k]]; 1783 subvals[i++] = vals[k]; 1784 } 1785 1786 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1787 1788 } else { /* !allcolumns */ 1789 #if defined(PETSC_USE_CTABLE) 1790 /* diagonal part A = c->A */ 1791 ncols = ai[Crow+1] - ai[Crow]; 1792 cols = aj + ai[Crow]; 1793 vals = a->a + ai[Crow]; 1794 i = 0; 1795 for (k=0; k<ncols; k++) { 1796 tcol = cmap_loc[cols[k]]; 1797 if (tcol) { 1798 subcols[i] = --tcol; 1799 subvals[i++] = vals[k]; 1800 } 1801 } 1802 1803 /* off-diagonal part B = c->B */ 
1804 ncols = bi[Crow+1] - bi[Crow]; 1805 cols = bj + bi[Crow]; 1806 vals = b->a + bi[Crow]; 1807 for (k=0; k<ncols; k++) { 1808 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1809 if (tcol) { 1810 subcols[i] = --tcol; 1811 subvals[i++] = vals[k]; 1812 } 1813 } 1814 #else 1815 /* diagonal part A = c->A */ 1816 ncols = ai[Crow+1] - ai[Crow]; 1817 cols = aj + ai[Crow]; 1818 vals = a->a + ai[Crow]; 1819 i = 0; 1820 for (k=0; k<ncols; k++) { 1821 tcol = cmap[cols[k]+cstart]; 1822 if (tcol) { 1823 subcols[i] = --tcol; 1824 subvals[i++] = vals[k]; 1825 } 1826 } 1827 1828 /* off-diagonal part B = c->B */ 1829 ncols = bi[Crow+1] - bi[Crow]; 1830 cols = bj + bi[Crow]; 1831 vals = b->a + bi[Crow]; 1832 for (k=0; k<ncols; k++) { 1833 tcol = cmap[bmap[cols[k]]]; 1834 if (tcol) { 1835 subcols[i] = --tcol; 1836 subvals[i++] = vals[k]; 1837 } 1838 } 1839 #endif 1840 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1841 } 1842 } 1843 } 1844 1845 /* Now assemble the off-proc rows */ 1846 for (i=0; i<nrqs; i++) { /* for each requested message */ 1847 /* recv values from other processes */ 1848 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1849 proc = pa[idex]; 1850 sbuf1_i = sbuf1[proc]; 1851 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1852 ct1 = 2 + 1; 1853 ct2 = 0; /* count of received C->j */ 1854 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1855 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1856 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1857 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1858 1859 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1860 max1 = sbuf1_i[2]; /* num of rows */ 1861 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1862 row = sbuf1_i[ct1]; /* row index of submat */ 
1863 if (!allcolumns) { 1864 idex = 0; 1865 if (scall == MAT_INITIAL_MATRIX) { 1866 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1867 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1868 #if defined(PETSC_USE_CTABLE) 1869 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1870 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1871 } else { 1872 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1873 } 1874 #else 1875 tcol = cmap[rbuf3_i[ct2]]; 1876 #endif 1877 if (tcol) { 1878 subcols[idex] = --tcol; 1879 subvals[idex++] = rbuf4_i[ct2]; 1880 1881 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1882 For reuse, we replace received C->j with index that should be inserted to submat */ 1883 rbuf3_i[ct3++] = ct2; 1884 } 1885 } 1886 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1887 1888 } else { /* scall == MAT_REUSE_MATRIX */ 1889 submat = submats[0]; 1890 subc = (Mat_SeqAIJ*)submat->data; 1891 1892 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1893 for (l=0; l<nnz; l++) { 1894 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1895 subvals[idex++] = rbuf4_i[ct2]; 1896 } 1897 1898 bj = subc->j + subc->i[row]; 1899 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1900 } 1901 } else { /* allcolumns */ 1902 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1903 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1904 ct2 += nnz; 1905 } 1906 } 1907 } 1908 1909 /* sending a->a are done */ 1910 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1911 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1912 1913 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1914 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1915 submats[0] = submat; 1916 1917 /* 
Restore the indices */ 1918 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1919 if (!allcolumns) { 1920 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1921 } 1922 1923 /* Destroy allocated memory */ 1924 for (i=0; i<nrqs; ++i) { 1925 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1926 } 1927 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1928 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1929 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1930 1931 if (scall == MAT_INITIAL_MATRIX) { 1932 ierr = PetscFree(lens);CHKERRQ(ierr); 1933 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1934 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1935 } 1936 PetscFunctionReturn(0); 1937 } 1938 1939 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1940 { 1941 PetscErrorCode ierr; 1942 PetscInt ncol; 1943 PetscBool colflag,allcolumns=PETSC_FALSE; 1944 1945 PetscFunctionBegin; 1946 /* Allocate memory to hold all the submatrices */ 1947 if (scall == MAT_INITIAL_MATRIX) { 1948 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1949 } 1950 1951 /* Check for special case: each processor gets entire matrix columns */ 1952 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1953 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1954 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1955 1956 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1957 PetscFunctionReturn(0); 1958 } 1959 1960 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1961 { 1962 PetscErrorCode ierr; 1963 PetscInt nmax,nstages,i,pos,max_no,nrow,ncol,in[2],out[2]; 1964 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1965 Mat_SeqAIJ *subc; 1966 Mat_SubSppt *smat; 1967 1968 PetscFunctionBegin; 1969 /* Check for special case: each processor has a single IS */ 1970 if 
(C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1971 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1972 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */ 1973 PetscFunctionReturn(0); 1974 } 1975 1976 /* Collect global wantallmatrix and nstages */ 1977 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1978 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1979 if (!nmax) nmax = 1; 1980 1981 if (scall == MAT_INITIAL_MATRIX) { 1982 /* Collect global wantallmatrix and nstages */ 1983 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1984 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1985 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1986 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1987 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1988 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1989 wantallmatrix = PETSC_TRUE; 1990 1991 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1992 } 1993 } 1994 1995 /* Determine the number of stages through which submatrices are done 1996 Each stage will extract nmax submatrices. 1997 nmax is determined by the matrix column dimension. 1998 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 1999 */ 2000 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 2001 2002 in[0] = -1*(PetscInt)wantallmatrix; 2003 in[1] = nstages; 2004 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2005 wantallmatrix = (PetscBool)(-out[0]); 2006 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2007 2008 } else { /* MAT_REUSE_MATRIX */ 2009 if (ismax) { 2010 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2011 smat = subc->submatis1; 2012 } else { /* (*submat)[0] is a dummy matrix */ 2013 smat = (Mat_SubSppt*)(*submat)[0]->data; 2014 } 2015 if (!smat) { 2016 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2017 wantallmatrix = PETSC_TRUE; 2018 } else if (smat->singleis) { 2019 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2020 PetscFunctionReturn(0); 2021 } else { 2022 nstages = smat->nstages; 2023 } 2024 } 2025 2026 if (wantallmatrix) { 2027 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2028 PetscFunctionReturn(0); 2029 } 2030 2031 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2032 if (scall == MAT_INITIAL_MATRIX) { 2033 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2034 } 2035 2036 for (i=0,pos=0; i<nstages; i++) { 2037 if (pos+nmax <= ismax) max_no = nmax; 2038 else if (pos == ismax) max_no = 0; 2039 else max_no = ismax-pos; 2040 2041 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2042 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2043 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2044 smat->nstages = nstages; 2045 } 2046 pos += max_no; 2047 } 2048 2049 if (ismax && scall == MAT_INITIAL_MATRIX) { 2050 /* save nstages for reuse */ 2051 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2052 smat = subc->submatis1; 2053 smat->nstages = nstages; 2054 } 2055 PetscFunctionReturn(0); 2056 } 2057 2058 /* 
  -------------------------------------------------------------------------*/
/*
  MatCreateSubMatrices_MPIAIJ_Local - does the work of one stage of submatrix extraction.

  For scall == MAT_INITIAL_MATRIX it runs a four-round nonblocking exchange:
    tag0: requested row indices, tag2: per-row column counts,
    tag3: column indices,        tag4: numerical values,
  creates the sequential submatrices and caches every communication buffer in a
  Mat_SubSppt attached to each submatrix so MAT_REUSE_MATRIX can skip rounds 1-3.
  When ismax == 0 a dummy MATDUMMY submats[0] is created just to carry the cache.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  /* Per-IS work arrays; allcolumns has an extra slot so the ismax==0 reuse path can use allcolumns[0] */
  ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
  ierr = PetscMalloc5(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    /* iscsorted stays PETSC_TRUE only if every iscol[] is sorted; otherwise rows are sorted at the end */
    ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
    if (!issorted[i]) iscsorted = issorted[i];

    ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);

    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);

    /* Check for special case: allcolumn */
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i]       = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows; recover all cached buffers from Mat_SubSppt */
    for (i=0; i<ismax; i++) {
      if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      ierr = PetscMemzero(subc->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
      if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them*/
    ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */

    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
      row2proc[i] = row2proc_i;

      /* owner search: for a sorted IS the scan over ranges never restarts */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j      = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz   += w1[j];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    tag0 = ((PetscObject)C)->tag;
    ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

    {
      /* carve the first msz entries of tmp into per-destination send buffers */
      PetscInt *iptr = tmp;
      k = 0;
      for (i=0; i<nrqs; i++) {
        j        = pa[i];
        iptr    += k;
        sbuf1[j] = iptr;
        k        = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
      ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0];
          sbuf1_j[2*k]   = ctr_j;
          sbuf1_j[2*k-1] = i;
        }
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
    /* rbuf2[] reuses the tail of tmp (beyond the msz entries used for sbuf1) */
    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i     = rbuf1[i];
        start       = 2*rbuf1_i[0] + 1;
        ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
        ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
        sbuf2_i     = sbuf2[i];
        /* count of nonzeros per requested row = diagonal part + off-diagonal part */
        for (j=start; j<end; j++) {
          id           = rbuf1_i[j] - rstart;
          ncols        = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
          sbuf2_i[j]   = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = r_status1[i].MPI_SOURCE;
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);
    ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

    /* Receive messages*/
    ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }
    ierr = PetscFree(r_status2);CHKERRQ(ierr);
    ierr = PetscFree(r_waits2);CHKERRQ(ierr);

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

    if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
    ierr = PetscFree(s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_status2);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols
               (B-part columns below cstart, then A-part, then B-part at/after cend:
               produces globally increasing column indices) */
            cols = sbuf_aj_i + ct2;

            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);

    /* create col map: global col of C -> local col of submatrices */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            /* stored values are shifted by +1 so 0 can mean "not in the submatrix" */
            ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            /* stored values are shifted by +1 so 0 can mean "not in the submatrix" */
            cmap_i[icol_i[j]] = j+1;
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);

    if (ismax) {
      ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    for (i=0; i<ismax; i++) {
      ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
            row--;
            if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    ierr = PetscFree(r_waits3);CHKERRQ(ierr);
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
    ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree(s_waits3);CHKERRQ(ierr);

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat; it caches the communication
         buffers so a later MAT_REUSE_MATRIX call can skip rounds 1-3 */
      ierr = PetscNew(&smat_i);CHKERRQ(ierr);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroy_SeqAIJ_Submatrices;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = allcolumns[i];
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = row2proc[i];
      smat_i->rmap       = rmap[i];
      smat_i->cmap       = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroy_Dummy_Submatrices;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = NULL;
      smat_i->rmap       = NULL;
      smat_i->cmap       = NULL;
    }

    if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in the SAME order
             that the column indices were packed into sbuf_aj above */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc      = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen = subc->ilen;
    imat_j    = subc->j;
    imat_i    = subc->i;
    imat_a    = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
        mat_i    = imat_i[row];
        mat_a    = imat_a + mat_i;
        mat_j    = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1; /* undo the +1 shift used in cmap */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k]; /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no  = sbuf1_i[2*j-1];
      rmap_i = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1; /* undo the +1 shift used in cmap */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree5(irow,icol,nrow,ncol,issorted);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);

  ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
 After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
 state, and needs to be "assembled" later by compressing B's column space.
2876 2877 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2878 Following this call, C->A & C->B have been created, even if empty. 2879 */ 2880 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2881 { 2882 /* If making this function public, change the error returned in this function away from _PLIB. */ 2883 PetscErrorCode ierr; 2884 Mat_MPIAIJ *aij; 2885 Mat_SeqAIJ *Baij; 2886 PetscBool seqaij,Bdisassembled; 2887 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2888 PetscScalar v; 2889 const PetscInt *rowindices,*colindices; 2890 2891 PetscFunctionBegin; 2892 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2893 if (A) { 2894 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2895 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2896 if (rowemb) { 2897 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2898 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2899 } else { 2900 if (C->rmap->n != A->rmap->n) { 2901 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2902 } 2903 } 2904 if (dcolemb) { 2905 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2906 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2907 } else { 2908 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2909 } 2910 } 2911 if (B) { 2912 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2913 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2914 if (rowemb) { 2915 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2916 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2917 } else { 2918 if (C->rmap->n != B->rmap->n) { 2919 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2920 } 2921 } 2922 if (ocolemb) { 2923 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2924 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2925 } else { 2926 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2927 } 2928 } 2929 2930 aij = (Mat_MPIAIJ*)C->data; 2931 if (!aij->A) { 2932 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2933 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2934 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2935 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2936 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2937 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2938 } 2939 if (A) { 2940 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2941 } else { 2942 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2943 } 2944 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2945 /* 2946 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2947 need to "disassemble" B -- convert it to using C's global indices. 2948 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2949 2950 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2951 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2952 2953 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2954 At least avoid calling MatSetValues() and the implied searches? 2955 */ 2956 2957 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2958 #if defined(PETSC_USE_CTABLE) 2959 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2960 #else 2961 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2962 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2963 if (aij->B) { 2964 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2965 } 2966 #endif 2967 ngcol = 0; 2968 if (aij->lvec) { 2969 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2970 } 2971 if (aij->garray) { 2972 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2973 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2974 } 2975 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2976 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2977 } 2978 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2979 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2980 } 2981 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2982 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2983 } 2984 } 2985 Bdisassembled = PETSC_FALSE; 2986 if (!aij->B) { 2987 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2988 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2989 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2990 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2991 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2992 Bdisassembled = PETSC_TRUE; 2993 } 2994 if (B) { 2995 Baij = (Mat_SeqAIJ*)B->data; 2996 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2997 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2998 for (i=0; i<B->rmap->n; i++) { 2999 nz[i] = Baij->i[i+1] - Baij->i[i]; 3000 } 3001 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3002 ierr = PetscFree(nz);CHKERRQ(ierr); 3003 } 3004 3005 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3006 shift = rend-rstart; 3007 count = 0; 3008 rowindices = NULL; 3009 colindices = NULL; 3010 if (rowemb) { 3011 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3012 } 3013 if (ocolemb) { 3014 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3015 } 3016 for (i=0; i<B->rmap->n; i++) { 3017 PetscInt row; 3018 row = i; 3019 if (rowindices) row = rowindices[i]; 3020 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3021 col = Baij->j[count]; 3022 if (colindices) col = colindices[col]; 3023 if (Bdisassembled && col>=rstart) col += shift; 3024 v = Baij->a[count]; 3025 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3026 ++count; 3027 } 3028 } 3029 /* No assembly for aij->B is necessary. */ 3030 /* FIXME: set aij->B's nonzerostate correctly. */ 3031 } else { 3032 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3033 } 3034 C->preallocated = PETSC_TRUE; 3035 C->was_assembled = PETSC_FALSE; 3036 C->assembled = PETSC_FALSE; 3037 /* 3038 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3039 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3040 */ 3041 PetscFunctionReturn(0); 3042 } 3043 3044 /* 3045 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3046 */ 3047 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3048 { 3049 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3050 3051 PetscFunctionBegin; 3052 PetscValidPointer(A,2); 3053 PetscValidPointer(B,3); 3054 /* FIXME: make sure C is assembled */ 3055 *A = aij->A; 3056 *B = aij->B; 3057 /* Note that we don't incref *A and *B, so be careful! */ 3058 PetscFunctionReturn(0); 3059 } 3060 3061 /* 3062 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
3063 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3064 */ 3065 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3066 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3067 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3068 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3069 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3070 { 3071 PetscErrorCode ierr; 3072 PetscMPIInt isize,flag; 3073 PetscInt i,ii,cismax,ispar; 3074 Mat *A,*B; 3075 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3076 3077 PetscFunctionBegin; 3078 if (!ismax) PetscFunctionReturn(0); 3079 3080 for (i = 0, cismax = 0; i < ismax; ++i) { 3081 PetscMPIInt isize; 3082 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3083 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3084 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3085 if (isize > 1) ++cismax; 3086 } 3087 3088 /* 3089 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3090 ispar counts the number of parallel ISs across C's comm. 3091 */ 3092 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3093 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3094 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3095 PetscFunctionReturn(0); 3096 } 3097 3098 /* if (ispar) */ 3099 /* 3100 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3101 These are used to extract the off-diag portion of the resulting parallel matrix. 
3102 The row IS for the off-diag portion is the same as for the diag portion, 3103 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3104 */ 3105 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3106 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3107 for (i = 0, ii = 0; i < ismax; ++i) { 3108 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3109 if (isize > 1) { 3110 /* 3111 TODO: This is the part that's ***NOT SCALABLE***. 3112 To fix this we need to extract just the indices of C's nonzero columns 3113 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3114 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3115 to be done without serializing on the IS list, so, most likely, it is best 3116 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3117 */ 3118 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3119 /* Now we have to 3120 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3121 were sorted on each rank, concatenated they might no longer be sorted; 3122 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3123 indices in the nondecreasing order to the original index positions. 3124 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3125 */ 3126 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3127 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3128 ++ii; 3129 } 3130 } 3131 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3132 for (i = 0, ii = 0; i < ismax; ++i) { 3133 PetscInt j,issize; 3134 const PetscInt *indices; 3135 3136 /* 3137 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 
3138 */ 3139 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3140 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3141 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3142 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3143 for (j = 1; j < issize; ++j) { 3144 if (indices[j] == indices[j-1]) { 3145 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3146 } 3147 } 3148 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3149 3150 3151 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3152 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3153 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3154 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3155 for (j = 1; j < issize; ++j) { 3156 if (indices[j-1] == indices[j]) { 3157 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3158 } 3159 } 3160 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3161 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3162 if (isize > 1) { 3163 cisrow[ii] = isrow[i]; 3164 ++ii; 3165 } 3166 } 3167 /* 3168 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3169 array of sequential matrices underlying the resulting parallel matrices. 3170 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3171 contain duplicates. 3172 3173 There are as many diag matrices as there are original index sets. There are only as many parallel 3174 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
3175 3176 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3177 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3178 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3179 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3180 with A[i] and B[ii] extracted from the corresponding MPI submat. 3181 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3182 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3183 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3184 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3185 values into A[i] and B[ii] sitting inside the corresponding submat. 3186 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3187 A[i], B[ii], AA[i] or BB[ii] matrices. 3188 */ 3189 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3190 if (scall == MAT_INITIAL_MATRIX) { 3191 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3192 } 3193 3194 /* Now obtain the sequential A and B submatrices separately. 
*/ 3195 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3196 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3197 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3198 3199 /* 3200 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3201 matrices A & B have been extracted directly into the parallel matrices containing them, or 3202 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3203 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3204 to have the same sparsity pattern. 3205 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3206 must be constructed for C. This is done by setseqmat(s). 3207 */ 3208 for (i = 0, ii = 0; i < ismax; ++i) { 3209 /* 3210 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3211 That way we can avoid sorting and computing permutations when reusing. 3212 To this end: 3213 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3214 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3215 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3216 */ 3217 MatStructure pattern; 3218 pattern = DIFFERENT_NONZERO_PATTERN; 3219 3220 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3221 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3222 if (isize > 1) { 3223 if (scall == MAT_INITIAL_MATRIX) { 3224 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3225 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3226 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3227 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3228 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3229 } 3230 /* 3231 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3232 */ 3233 { 3234 Mat AA,BB; 3235 AA = A[i]; 3236 BB = B[ii]; 3237 if (AA || BB) { 3238 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3239 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3240 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3241 } 3242 3243 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3244 } 3245 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3246 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3247 ++ii; 3248 } else { /* if (isize == 1) */ 3249 if (scall == MAT_REUSE_MATRIX) { 3250 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3251 } 3252 if (isrow_p[i] || iscol_p[i]) { 3253 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3254 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3255 /* Otherwise A is extracted straight into (*submats)[i]. */ 3256 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3257 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3258 } else (*submat)[i] = A[i]; 3259 } 3260 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3261 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3262 } 3263 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3264 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3265 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3266 ierr = PetscFree(A);CHKERRQ(ierr); 3267 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3268 PetscFunctionReturn(0); 3269 } 3270 3271 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3272 { 3273 PetscErrorCode ierr; 3274 3275 PetscFunctionBegin; 3276 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3277 PetscFunctionReturn(0); 3278 } 3279