1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 #undef __FUNCT__ 23 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 24 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 25 { 26 PetscErrorCode ierr; 27 PetscInt i; 28 29 PetscFunctionBegin; 30 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 31 for (i=0; i<ov; ++i) { 32 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 33 } 34 PetscFunctionReturn(0); 35 } 36 37 #undef __FUNCT__ 38 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Scalable" 39 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 40 { 41 PetscErrorCode ierr; 42 PetscInt i; 43 44 PetscFunctionBegin; 45 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative 
overlap specified"); 46 for (i=0; i<ov; ++i) { 47 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 48 } 49 PetscFunctionReturn(0); 50 } 51 52 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once_Scalable" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 56 { 57 PetscErrorCode ierr; 58 MPI_Comm comm; 59 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 60 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 61 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 62 const PetscInt *indices_i,**indices; 63 PetscLayout rmap; 64 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 65 PetscSF sf; 66 PetscSFNode *remote; 67 68 PetscFunctionBegin; 69 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 70 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 71 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 72 /* get row map to determine where rows should be going */ 73 ierr = MatGetLayouts(mat,&rmap,PETSC_NULL);CHKERRQ(ierr); 74 /* retrieve IS data and put all together so that we 75 * can optimize communication 76 * */ 77 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 78 for (i=0,tlength=0; i<nidx; i++){ 79 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 80 tlength += length[i]; 81 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 82 } 83 /* find these rows on remote processors */ 84 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 85 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 86 nrrows = 0; 87 for (i=0; i<nidx; i++){ 88 length_i = length[i]; 89 indices_i = indices[i]; 90 for (j=0; j<length_i; j++){ 91 owner = -1; 92 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 93 /* remote processors */ 94 if (owner != rank){ 95 tosizes_temp[owner]++; /* number of rows to owner 
*/ 96 rrow_ranks[nrrows] = owner; /* processor */ 97 rrow_isids[nrrows] = i; /* is id */ 98 remoterows[nrrows++] = indices_i[j]; /* row */ 99 } 100 } 101 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 102 } 103 ierr = PetscFree2(indices,length);CHKERRQ(ierr); 104 /* test if we need to exchange messages 105 * generally speaking, we do not need to exchange 106 * data when overlap is 1 107 * */ 108 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 109 /* we do not have any messages 110 * It usually corresponds to overlap 1 111 * */ 112 if (!reducednrrows){ 113 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 114 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 115 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 116 PetscFunctionReturn(0); 117 } 118 nto = 0; 119 /* send sizes and ranks for building a two-sided communcation */ 120 for (i=0; i<size; i++){ 121 if (tosizes_temp[i]){ 122 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 123 tosizes_temp[i] = nto; /* a map from processor to index */ 124 toranks[nto++] = i; /* processor */ 125 } 126 } 127 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 128 for (i=0; i<nto; i++){ 129 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 130 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 131 } 132 /* send information to other processors */ 133 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 134 nrecvrows = 0; 135 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 136 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 137 nrecvrows = 0; 138 for (i=0; i<nfrom; i++){ 139 for (j=0; j<fromsizes[2*i]; j++){ 140 remote[nrecvrows].rank = fromranks[i]; 141 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 142 } 143 } 144 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 145 ierr = 
PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 146 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 147 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 148 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 149 /* message pair <no of is, row> */ 150 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 151 for (i=0; i<nrrows; i++){ 152 owner = rrow_ranks[i]; /* processor */ 153 j = tosizes_temp[owner]; /* index */ 154 todata[toffsets[j]++] = rrow_isids[i]; 155 todata[toffsets[j]++] = remoterows[i]; 156 } 157 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 158 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 159 ierr = PetscFree(toffsets);CHKERRQ(ierr); 160 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 161 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 162 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 163 /* send rows belonging to the remote so that then we could get the overlapping data back */ 164 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 165 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 166 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 167 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 168 ierr = PetscFree(fromranks);CHKERRQ(ierr); 169 nrecvrows = 0; 170 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 171 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 172 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 173 nrecvrows = 0; 174 for (i=0; i<nto; i++){ 175 for (j=0; j<tosizes[2*i]; j++){ 176 remote[nrecvrows].rank = toranks[i]; 177 remote[nrecvrows++].index = tosizes[2*i+1]+j; 178 } 179 } 180 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 181 ierr = 
PetscSFSetGraph(sf,nrecvrows,nrecvrows,PETSC_NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 182 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 183 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 184 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 185 /* overlap communication and computation */ 186 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 187 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 188 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 189 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 190 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 191 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 192 ierr = PetscFree(toranks);CHKERRQ(ierr); 193 ierr = PetscFree(tosizes);CHKERRQ(ierr); 194 ierr = PetscFree(todata);CHKERRQ(ierr); 195 PetscFunctionReturn(0); 196 } 197 198 #undef __FUNCT__ 199 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive_Scalable" 200 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 201 { 202 PetscInt *isz,isz_i,i,j,is_id, data_size; 203 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 204 const PetscInt *indices_i_temp; 205 PetscErrorCode ierr; 206 207 PetscFunctionBegin; 208 max_lsize = 0; 209 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 210 for (i=0; i<nidx; i++){ 211 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 212 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 213 isz[i] = lsize; 214 } 215 ierr = PetscMalloc1((max_lsize+nrecvs)*nidx,&indices_temp);CHKERRQ(ierr); 216 for (i=0; i<nidx; i++){ 217 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 218 ierr = PetscMemcpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, sizeof(PetscInt)*isz[i]);CHKERRQ(ierr); 219 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 220 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 221 } 222 /* retrieve information to get row id and its overlap */ 223 for (i=0; i<nrecvs; ){ 224 is_id = recvdata[i++]; 225 data_size = recvdata[i++]; 226 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 227 isz_i = isz[is_id]; 228 for (j=0; j< data_size; j++){ 229 col = recvdata[i++]; 230 indices_i[isz_i++] = col; 231 } 232 isz[is_id] = isz_i; 233 } 234 /* remove duplicate entities */ 235 for (i=0; i<nidx; i++){ 236 indices_i = indices_temp+(max_lsize+nrecvs)*i; 237 isz_i = isz[i]; 238 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 239 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 240 } 241 ierr = PetscFree(isz);CHKERRQ(ierr); 242 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 243 PetscFunctionReturn(0); 244 } 245 246 #undef __FUNCT__ 247 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Send_Scalable" 248 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 249 { 250 PetscLayout rmap,cmap; 251 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 252 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 253 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 254 const PetscInt *gcols,*ai,*aj,*bi,*bj; 255 Mat amat,bmat; 256 PetscMPIInt rank; 257 PetscBool done; 258 MPI_Comm comm; 259 PetscErrorCode ierr; 260 261 PetscFunctionBegin; 262 ierr = 
PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 263 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 264 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 265 /* Even if the mat is symmetric, we still assume it is not symmetric */ 266 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 267 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 268 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 269 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 270 /* total number of nonzero values is used to estimate the memory usage in the next step */ 271 tnz = ai[an]+bi[bn]; 272 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 273 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 274 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 275 /* to find the longest message */ 276 max_fszs = 0; 277 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 278 /* better way to estimate number of nonzero in the mat??? 
*/ 279 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 280 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 281 rows_pos = 0; 282 totalrows = 0; 283 for (i=0; i<nfrom; i++){ 284 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 285 /* group data together */ 286 for (j=0; j<fromsizes[2*i]; j+=2){ 287 is_id = fromrows[rows_pos++];/* no of is */ 288 rows_i = rows_data[is_id]; 289 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 290 } 291 /* estimate a space to avoid multiple allocations */ 292 for (j=0; j<nidx; j++){ 293 indvc_ij = 0; 294 rows_i = rows_data[j]; 295 for (l=0; l<rows_pos_i[j]; l++){ 296 row = rows_i[l]-rstart; 297 start = ai[row]; 298 end = ai[row+1]; 299 for (k=start; k<end; k++){ /* Amat */ 300 col = aj[k] + cstart; 301 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 302 } 303 start = bi[row]; 304 end = bi[row+1]; 305 for (k=start; k<end; k++) { /* Bmat */ 306 col = gcols[bj[k]]; 307 indices_tmp[indvc_ij++] = col; 308 } 309 } 310 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 311 indv_counts[i*nidx+j] = indvc_ij; 312 totalrows += indvc_ij; 313 } 314 } 315 /* message triple <no of is, number of rows, rows> */ 316 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 317 totalrows = 0; 318 rows_pos = 0; 319 /* use this code again */ 320 for (i=0;i<nfrom;i++){ 321 ierr = PetscMemzero(rows_pos_i,sizeof(PetscInt)*nidx);CHKERRQ(ierr); 322 for (j=0; j<fromsizes[2*i]; j+=2){ 323 is_id = fromrows[rows_pos++]; 324 rows_i = rows_data[is_id]; 325 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 326 } 327 /* add data */ 328 for (j=0; j<nidx; j++){ 329 if (!indv_counts[i*nidx+j]) continue; 330 indvc_ij = 0; 331 sbdata[totalrows++] = j; 332 sbdata[totalrows++] = indv_counts[i*nidx+j]; 333 sbsizes[2*i] += 2; 334 rows_i = rows_data[j]; 335 for 
(l=0; l<rows_pos_i[j]; l++){ 336 row = rows_i[l]-rstart; 337 start = ai[row]; 338 end = ai[row+1]; 339 for (k=start; k<end; k++){ /* Amat */ 340 col = aj[k] + cstart; 341 indices_tmp[indvc_ij++] = col; 342 } 343 start = bi[row]; 344 end = bi[row+1]; 345 for (k=start; k<end; k++) { /* Bmat */ 346 col = gcols[bj[k]]; 347 indices_tmp[indvc_ij++] = col; 348 } 349 } 350 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 351 sbsizes[2*i] += indvc_ij; 352 ierr = PetscMemcpy(sbdata+totalrows,indices_tmp,sizeof(PetscInt)*indvc_ij);CHKERRQ(ierr); 353 totalrows += indvc_ij; 354 } 355 } 356 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 357 for (i=0; i<nfrom; i++){ 358 offsets[i+1] = offsets[i] + sbsizes[2*i]; 359 sbsizes[2*i+1] = offsets[i]; 360 } 361 ierr = PetscFree(offsets);CHKERRQ(ierr); 362 if (sbrowsizes) *sbrowsizes = sbsizes; 363 if (sbrows) *sbrows = sbdata; 364 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 365 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 366 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local_Scalable" 372 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 373 { 374 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 375 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 376 PetscInt lsize,lsize_tmp,owner; 377 PetscMPIInt rank; 378 Mat amat,bmat; 379 PetscBool done; 380 PetscLayout cmap,rmap; 381 MPI_Comm comm; 382 PetscErrorCode ierr; 383 384 PetscFunctionBegin; 385 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 386 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 387 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 388 ierr = 
MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 389 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 390 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 391 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 392 /* is it a safe way to compute number of nonzero values ? */ 393 tnz = ai[an]+bi[bn]; 394 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 395 ierr = PetscLayoutGetRange(rmap,&rstart,PETSC_NULL);CHKERRQ(ierr); 396 ierr = PetscLayoutGetRange(cmap,&cstart,PETSC_NULL);CHKERRQ(ierr); 397 /* it is a better way to estimate memory than the old implementation 398 * where global size of matrix is used 399 * */ 400 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 401 for (i=0; i<nidx; i++) { 402 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 403 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 404 lsize_tmp = 0; 405 for (j=0; j<lsize; j++) { 406 owner = -1; 407 row = indices[j]; 408 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 409 if (owner != rank) continue; 410 /* local number */ 411 row -= rstart; 412 start = ai[row]; 413 end = ai[row+1]; 414 for (k=start; k<end; k++) { /* Amat */ 415 col = aj[k] + cstart; 416 indices_temp[lsize_tmp++] = col; 417 } 418 start = bi[row]; 419 end = bi[row+1]; 420 for (k=start; k<end; k++) { /* Bmat */ 421 col = gcols[bj[k]]; 422 indices_temp[lsize_tmp++] = col; 423 } 424 } 425 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 426 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 427 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 428 ierr = ISCreateGeneral(PETSC_COMM_SELF,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 429 } 430 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 431 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 432 ierr = 
MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 433 PetscFunctionReturn(0); 434 } 435 436 437 /* 438 Sample message format: 439 If a processor A wants processor B to process some elements corresponding 440 to index sets is[1],is[5] 441 mesg [0] = 2 (no of index sets in the mesg) 442 ----------- 443 mesg [1] = 1 => is[1] 444 mesg [2] = sizeof(is[1]); 445 ----------- 446 mesg [3] = 5 => is[5] 447 mesg [4] = sizeof(is[5]); 448 ----------- 449 mesg [5] 450 mesg [n] datas[1] 451 ----------- 452 mesg[n+1] 453 mesg[m] data(is[5]) 454 ----------- 455 456 Notes: 457 nrqs - no of requests sent (or to be sent out) 458 nrqr - no of requests recieved (which have to be or which have been processed 459 */ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 462 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 463 { 464 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 465 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 466 const PetscInt **idx,*idx_i; 467 PetscInt *n,**data,len; 468 #if defined(PETSC_USE_CTABLE) 469 PetscTable *table_data,table_data_i; 470 PetscInt *tdata,tcount,tcount_max; 471 #else 472 PetscInt *data_i,*d_p; 473 #endif 474 PetscErrorCode ierr; 475 PetscMPIInt size,rank,tag1,tag2; 476 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 477 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 478 PetscBT *table; 479 MPI_Comm comm; 480 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 481 MPI_Status *s_status,*recv_status; 482 char *t_p; 483 484 PetscFunctionBegin; 485 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 486 size = c->size; 487 rank = c->rank; 488 M = C->rmap->N; 489 490 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 491 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 492 493 ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr); 494 495 for (i=0; i<imax; i++) { 496 ierr = 
ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 497 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 498 } 499 500 /* evaluate communication - mesg to who,length of mesg, and buffer space 501 required. Based on this, buffers are allocated, and data copied into them */ 502 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 503 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 504 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 505 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 506 for (i=0; i<imax; i++) { 507 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 508 idx_i = idx[i]; 509 len = n[i]; 510 for (j=0; j<len; j++) { 511 row = idx_i[j]; 512 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 513 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 514 w4[proc]++; 515 } 516 for (j=0; j<size; j++) { 517 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 518 } 519 } 520 521 nrqs = 0; /* no of outgoing messages */ 522 msz = 0; /* total mesg length (for all proc */ 523 w1[rank] = 0; /* no mesg sent to intself */ 524 w3[rank] = 0; 525 for (i=0; i<size; i++) { 526 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 527 } 528 /* pa - is list of processors to communicate with */ 529 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 530 for (i=0,j=0; i<size; i++) { 531 if (w1[i]) {pa[j] = i; j++;} 532 } 533 534 /* Each message would have a header = 1 + 2*(no of IS) + data */ 535 for (i=0; i<nrqs; i++) { 536 j = pa[i]; 537 w1[j] += w2[j] + 2*w3[j]; 538 msz += w1[j]; 539 } 540 541 /* Determine the number of messages to expect, their lengths, from from-ids */ 542 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 543 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 544 
545 /* Now post the Irecvs corresponding to these messages */ 546 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 547 548 /* Allocate Memory for outgoing messages */ 549 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 550 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 551 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 552 553 { 554 PetscInt *iptr = tmp,ict = 0; 555 for (i=0; i<nrqs; i++) { 556 j = pa[i]; 557 iptr += ict; 558 outdat[j] = iptr; 559 ict = w1[j]; 560 } 561 } 562 563 /* Form the outgoing messages */ 564 /* plug in the headers */ 565 for (i=0; i<nrqs; i++) { 566 j = pa[i]; 567 outdat[j][0] = 0; 568 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 569 ptr[j] = outdat[j] + 2*w3[j] + 1; 570 } 571 572 /* Memory for doing local proc's work */ 573 { 574 PetscInt M_BPB_imax = 0; 575 #if defined(PETSC_USE_CTABLE) 576 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 577 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 578 for (i=0; i<imax; i++) { 579 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 580 } 581 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 582 for (i=0; i<imax; i++) { 583 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 584 } 585 #else 586 PetscInt Mimax = 0; 587 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 588 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 589 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 590 for (i=0; i<imax; i++) { 591 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 592 data[i] = d_p + M*i; 593 } 594 #endif 595 } 596 597 /* Parse the IS and update local tables and the outgoing buf with the data */ 598 { 599 PetscInt n_i,isz_i,*outdat_j,ctr_j; 600 PetscBT table_i; 601 602 for (i=0; i<imax; i++) { 603 ierr = 
PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 604 n_i = n[i]; 605 table_i = table[i]; 606 idx_i = idx[i]; 607 #if defined(PETSC_USE_CTABLE) 608 table_data_i = table_data[i]; 609 #else 610 data_i = data[i]; 611 #endif 612 isz_i = isz[i]; 613 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 614 row = idx_i[j]; 615 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 616 if (proc != rank) { /* copy to the outgoing buffer */ 617 ctr[proc]++; 618 *ptr[proc] = row; 619 ptr[proc]++; 620 } else if (!PetscBTLookupSet(table_i,row)) { 621 #if defined(PETSC_USE_CTABLE) 622 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 623 #else 624 data_i[isz_i] = row; /* Update the local table */ 625 #endif 626 isz_i++; 627 } 628 } 629 /* Update the headers for the current IS */ 630 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 631 if ((ctr_j = ctr[j])) { 632 outdat_j = outdat[j]; 633 k = ++outdat_j[0]; 634 outdat_j[2*k] = ctr_j; 635 outdat_j[2*k-1] = i; 636 } 637 } 638 isz[i] = isz_i; 639 } 640 } 641 642 /* Now post the sends */ 643 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 644 for (i=0; i<nrqs; ++i) { 645 j = pa[i]; 646 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 647 } 648 649 /* No longer need the original indices */ 650 for (i=0; i<imax; ++i) { 651 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 652 } 653 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 654 655 for (i=0; i<imax; ++i) { 656 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 657 } 658 659 /* Do Local work */ 660 #if defined(PETSC_USE_CTABLE) 661 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 662 #else 663 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 664 #endif 665 666 /* Receive messages */ 667 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 668 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 669 670 
ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 671 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 672 673 /* Phase 1 sends are complete - deallocate buffers */ 674 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 675 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 676 677 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 678 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 679 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 680 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 681 ierr = PetscFree(rbuf);CHKERRQ(ierr); 682 683 684 /* Send the data back */ 685 /* Do a global reduction to know the buffer space req for incoming messages */ 686 { 687 PetscMPIInt *rw1; 688 689 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 690 691 for (i=0; i<nrqr; ++i) { 692 proc = recv_status[i].MPI_SOURCE; 693 694 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 695 rw1[proc] = isz1[i]; 696 } 697 ierr = PetscFree(onodes1);CHKERRQ(ierr); 698 ierr = PetscFree(olengths1);CHKERRQ(ierr); 699 700 /* Determine the number of messages to expect, their lengths, from from-ids */ 701 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 702 ierr = PetscFree(rw1);CHKERRQ(ierr); 703 } 704 /* Now post the Irecvs corresponding to these messages */ 705 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 706 707 /* Now post the sends */ 708 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 709 for (i=0; i<nrqr; ++i) { 710 j = recv_status[i].MPI_SOURCE; 711 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 712 } 713 714 /* receive work done on other processors */ 715 { 716 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 717 PetscMPIInt idex; 718 PetscBT table_i; 719 MPI_Status *status2; 720 721 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 722 for (i=0; i<nrqs; ++i) { 723 ierr = 
MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 724 /* Process the message */ 725 rbuf2_i = rbuf2[idex]; 726 ct1 = 2*rbuf2_i[0]+1; 727 jmax = rbuf2[idex][0]; 728 for (j=1; j<=jmax; j++) { 729 max = rbuf2_i[2*j]; 730 is_no = rbuf2_i[2*j-1]; 731 isz_i = isz[is_no]; 732 table_i = table[is_no]; 733 #if defined(PETSC_USE_CTABLE) 734 table_data_i = table_data[is_no]; 735 #else 736 data_i = data[is_no]; 737 #endif 738 for (k=0; k<max; k++,ct1++) { 739 row = rbuf2_i[ct1]; 740 if (!PetscBTLookupSet(table_i,row)) { 741 #if defined(PETSC_USE_CTABLE) 742 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 743 #else 744 data_i[isz_i] = row; 745 #endif 746 isz_i++; 747 } 748 } 749 isz[is_no] = isz_i; 750 } 751 } 752 753 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 754 ierr = PetscFree(status2);CHKERRQ(ierr); 755 } 756 757 #if defined(PETSC_USE_CTABLE) 758 tcount_max = 0; 759 for (i=0; i<imax; ++i) { 760 table_data_i = table_data[i]; 761 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 762 if (tcount_max < tcount) tcount_max = tcount; 763 } 764 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 765 #endif 766 767 for (i=0; i<imax; ++i) { 768 #if defined(PETSC_USE_CTABLE) 769 PetscTablePosition tpos; 770 table_data_i = table_data[i]; 771 772 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 773 while (tpos) { 774 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 775 tdata[--j] = --k; 776 } 777 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 778 #else 779 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 780 #endif 781 } 782 783 ierr = PetscFree(onodes2);CHKERRQ(ierr); 784 ierr = PetscFree(olengths2);CHKERRQ(ierr); 785 786 ierr = PetscFree(pa);CHKERRQ(ierr); 787 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 788 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 789 ierr = 
PetscFree(s_waits1);CHKERRQ(ierr); 790 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 791 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 792 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 793 ierr = PetscFree(s_status);CHKERRQ(ierr); 794 ierr = PetscFree(recv_status);CHKERRQ(ierr); 795 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 796 ierr = PetscFree(xdata);CHKERRQ(ierr); 797 ierr = PetscFree(isz1);CHKERRQ(ierr); 798 #if defined(PETSC_USE_CTABLE) 799 for (i=0; i<imax; i++) { 800 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 801 } 802 ierr = PetscFree(table_data);CHKERRQ(ierr); 803 ierr = PetscFree(tdata);CHKERRQ(ierr); 804 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 805 #else 806 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 807 #endif 808 PetscFunctionReturn(0); 809 } 810 811 #undef __FUNCT__ 812 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 813 /* 814 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 815 the work on the local processor. 816 817 Inputs: 818 C - MAT_MPIAIJ; 819 imax - total no of index sets processed at a time; 820 table - an array of char - size = m bits. 
821 822 Output: 823 isz - array containing the count of the solution elements corresponding 824 to each index set; 825 data or table_data - pointer to the solutions 826 */ 827 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 828 { 829 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 830 Mat A = c->A,B = c->B; 831 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 832 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 833 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 834 PetscBT table_i; 835 #if defined(PETSC_USE_CTABLE) 836 PetscTable table_data_i; 837 PetscErrorCode ierr; 838 PetscTablePosition tpos; 839 PetscInt tcount,*tdata; 840 #else 841 PetscInt *data_i; 842 #endif 843 844 PetscFunctionBegin; 845 rstart = C->rmap->rstart; 846 cstart = C->cmap->rstart; 847 ai = a->i; 848 aj = a->j; 849 bi = b->i; 850 bj = b->j; 851 garray = c->garray; 852 853 for (i=0; i<imax; i++) { 854 #if defined(PETSC_USE_CTABLE) 855 /* copy existing entries of table_data_i into tdata[] */ 856 table_data_i = table_data[i]; 857 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 858 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 859 860 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 861 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 862 while (tpos) { 863 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 864 tdata[--j] = --row; 865 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 866 } 867 #else 868 data_i = data[i]; 869 #endif 870 table_i = table[i]; 871 isz_i = isz[i]; 872 max = isz[i]; 873 874 for (j=0; j<max; j++) { 875 #if defined(PETSC_USE_CTABLE) 876 row = tdata[j] - rstart; 877 #else 878 row = data_i[j] - rstart; 879 #endif 880 start = ai[row]; 881 end = ai[row+1]; 882 for (k=start; k<end; k++) { /* Amat */ 883 val = aj[k] + cstart; 884 if 
(!PetscBTLookupSet(table_i,val)) { 885 #if defined(PETSC_USE_CTABLE) 886 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 887 #else 888 data_i[isz_i] = val; 889 #endif 890 isz_i++; 891 } 892 } 893 start = bi[row]; 894 end = bi[row+1]; 895 for (k=start; k<end; k++) { /* Bmat */ 896 val = garray[bj[k]]; 897 if (!PetscBTLookupSet(table_i,val)) { 898 #if defined(PETSC_USE_CTABLE) 899 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 900 #else 901 data_i[isz_i] = val; 902 #endif 903 isz_i++; 904 } 905 } 906 } 907 isz[i] = isz_i; 908 909 #if defined(PETSC_USE_CTABLE) 910 ierr = PetscFree(tdata);CHKERRQ(ierr); 911 #endif 912 } 913 PetscFunctionReturn(0); 914 } 915 916 #undef __FUNCT__ 917 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 918 /* 919 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 920 and return the output 921 922 Input: 923 C - the matrix 924 nrqr - no of messages being processed. 925 rbuf - an array of pointers to the recieved requests 926 927 Output: 928 xdata - array of messages to be sent back 929 isz1 - size of each message 930 931 For better efficiency perhaps we should malloc separately each xdata[i], 932 then if a remalloc is required we need only copy the data for that one row 933 rather then all previous rows as it is now where a single large chunck of 934 memory is used. 
935 936 */ 937 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 938 { 939 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 940 Mat A = c->A,B = c->B; 941 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 942 PetscErrorCode ierr; 943 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 944 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 945 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 946 PetscInt *rbuf_i,kmax,rbuf_0; 947 PetscBT xtable; 948 949 PetscFunctionBegin; 950 m = C->rmap->N; 951 rstart = C->rmap->rstart; 952 cstart = C->cmap->rstart; 953 ai = a->i; 954 aj = a->j; 955 bi = b->i; 956 bj = b->j; 957 garray = c->garray; 958 959 960 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 961 rbuf_i = rbuf[i]; 962 rbuf_0 = rbuf_i[0]; 963 ct += rbuf_0; 964 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 965 } 966 967 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 968 else max1 = 1; 969 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 970 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 971 ++no_malloc; 972 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 973 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 974 975 ct3 = 0; 976 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 977 rbuf_i = rbuf[i]; 978 rbuf_0 = rbuf_i[0]; 979 ct1 = 2*rbuf_0+1; 980 ct2 = ct1; 981 ct3 += ct1; 982 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 983 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 984 oct2 = ct2; 985 kmax = rbuf_i[2*j]; 986 for (k=0; k<kmax; k++,ct1++) { 987 row = rbuf_i[ct1]; 988 if (!PetscBTLookupSet(xtable,row)) { 989 if (!(ct3 < mem_estimate)) { 990 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 991 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 992 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 993 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 994 xdata[0] = tmp; 995 mem_estimate = new_estimate; ++no_malloc; 996 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 997 } 998 xdata[i][ct2++] = row; 999 ct3++; 1000 } 1001 } 1002 for (k=oct2,max2=ct2; k<max2; k++) { 1003 row = xdata[i][k] - rstart; 1004 start = ai[row]; 1005 end = ai[row+1]; 1006 for (l=start; l<end; l++) { 1007 val = aj[l] + cstart; 1008 if (!PetscBTLookupSet(xtable,val)) { 1009 if (!(ct3 < mem_estimate)) { 1010 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1011 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1012 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1013 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1014 xdata[0] = tmp; 1015 mem_estimate = new_estimate; ++no_malloc; 1016 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1017 } 1018 xdata[i][ct2++] = val; 1019 ct3++; 1020 } 1021 } 1022 start = bi[row]; 1023 end = bi[row+1]; 1024 for (l=start; l<end; l++) { 1025 val = garray[bj[l]]; 1026 if (!PetscBTLookupSet(xtable,val)) { 1027 if (!(ct3 < mem_estimate)) { 1028 
new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1029 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1030 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 1031 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1032 xdata[0] = tmp; 1033 mem_estimate = new_estimate; ++no_malloc; 1034 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1035 } 1036 xdata[i][ct2++] = val; 1037 ct3++; 1038 } 1039 } 1040 } 1041 /* Update the header*/ 1042 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1043 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1044 } 1045 xdata[i][0] = rbuf_0; 1046 xdata[i+1] = xdata[i] + ct2; 1047 isz1[i] = ct2; /* size of each message */ 1048 } 1049 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1050 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1051 PetscFunctionReturn(0); 1052 } 1053 /* -------------------------------------------------------------------------*/ 1054 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 1055 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 1056 /* 1057 Every processor gets the entire matrix 1058 */ 1059 #undef __FUNCT__ 1060 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 1061 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1062 { 1063 Mat B; 1064 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1065 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1066 PetscErrorCode ierr; 1067 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 1068 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1069 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1070 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 1071 1072 PetscFunctionBegin; 1073 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 1074 ierr = 
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 1075 1076 if (scall == MAT_INITIAL_MATRIX) { 1077 /* ---------------------------------------------------------------- 1078 Tell every processor the number of nonzeros per row 1079 */ 1080 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1081 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1082 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1083 } 1084 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1085 for (i=0; i<size; i++) { 1086 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1087 displs[i] = A->rmap->range[i]; 1088 } 1089 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1090 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1091 #else 1092 sendcount = A->rmap->rend - A->rmap->rstart; 1093 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1094 #endif 1095 /* --------------------------------------------------------------- 1096 Create the sequential matrix of the same type as the local block diagonal 1097 */ 1098 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1099 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1100 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1101 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1102 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1103 ierr = PetscMalloc1(1,Bin);CHKERRQ(ierr); 1104 **Bin = B; 1105 b = (Mat_SeqAIJ*)B->data; 1106 1107 /*-------------------------------------------------------------------- 1108 Copy my part of matrix column indices over 1109 */ 1110 sendcount = ad->nz + bd->nz; 1111 jsendbuf = b->j + b->i[rstarts[rank]]; 1112 a_jsendbuf = ad->j; 1113 b_jsendbuf = bd->j; 1114 n = A->rmap->rend - 
A->rmap->rstart; 1115 cnt = 0; 1116 for (i=0; i<n; i++) { 1117 1118 /* put in lower diagonal portion */ 1119 m = bd->i[i+1] - bd->i[i]; 1120 while (m > 0) { 1121 /* is it above diagonal (in bd (compressed) numbering) */ 1122 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1123 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1124 m--; 1125 } 1126 1127 /* put in diagonal portion */ 1128 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1129 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1130 } 1131 1132 /* put in upper diagonal portion */ 1133 while (m-- > 0) { 1134 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1135 } 1136 } 1137 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1138 1139 /*-------------------------------------------------------------------- 1140 Gather all column indices to all processors 1141 */ 1142 for (i=0; i<size; i++) { 1143 recvcounts[i] = 0; 1144 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1145 recvcounts[i] += lens[j]; 1146 } 1147 } 1148 displs[0] = 0; 1149 for (i=1; i<size; i++) { 1150 displs[i] = displs[i-1] + recvcounts[i-1]; 1151 } 1152 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1153 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1154 #else 1155 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1156 #endif 1157 /*-------------------------------------------------------------------- 1158 Assemble the matrix into useable form (note numerical values not yet set) 1159 */ 1160 /* set the b->ilen (length of each row) values */ 1161 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1162 /* set the b->i indices */ 1163 b->i[0] = 0; 1164 for (i=1; i<=A->rmap->N; i++) { 1165 b->i[i] = b->i[i-1] + lens[i-1]; 1166 } 1167 ierr = PetscFree(lens);CHKERRQ(ierr); 1168 ierr = 
MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1169 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1170 1171 } else { 1172 B = **Bin; 1173 b = (Mat_SeqAIJ*)B->data; 1174 } 1175 1176 /*-------------------------------------------------------------------- 1177 Copy my part of matrix numerical values into the values location 1178 */ 1179 if (flag == MAT_GET_VALUES) { 1180 sendcount = ad->nz + bd->nz; 1181 sendbuf = b->a + b->i[rstarts[rank]]; 1182 a_sendbuf = ad->a; 1183 b_sendbuf = bd->a; 1184 b_sendj = bd->j; 1185 n = A->rmap->rend - A->rmap->rstart; 1186 cnt = 0; 1187 for (i=0; i<n; i++) { 1188 1189 /* put in lower diagonal portion */ 1190 m = bd->i[i+1] - bd->i[i]; 1191 while (m > 0) { 1192 /* is it above diagonal (in bd (compressed) numbering) */ 1193 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1194 sendbuf[cnt++] = *b_sendbuf++; 1195 m--; 1196 b_sendj++; 1197 } 1198 1199 /* put in diagonal portion */ 1200 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1201 sendbuf[cnt++] = *a_sendbuf++; 1202 } 1203 1204 /* put in upper diagonal portion */ 1205 while (m-- > 0) { 1206 sendbuf[cnt++] = *b_sendbuf++; 1207 b_sendj++; 1208 } 1209 } 1210 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1211 1212 /* ----------------------------------------------------------------- 1213 Gather all numerical values to all processors 1214 */ 1215 if (!recvcounts) { 1216 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1217 } 1218 for (i=0; i<size; i++) { 1219 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1220 } 1221 displs[0] = 0; 1222 for (i=1; i<size; i++) { 1223 displs[i] = displs[i-1] + recvcounts[i-1]; 1224 } 1225 recvbuf = b->a; 1226 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1227 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1228 #else 1229 ierr = 
MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 1230 #endif 1231 } /* endof (flag == MAT_GET_VALUES) */ 1232 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1233 1234 if (A->symmetric) { 1235 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1236 } else if (A->hermitian) { 1237 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1238 } else if (A->structurally_symmetric) { 1239 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1240 } 1241 PetscFunctionReturn(0); 1242 } 1243 #undef __FUNCT__ 1244 #define __FUNCT__ "MatDestroy_MPIAIJ_MatGetSubmatrices" 1245 PetscErrorCode MatDestroy_MPIAIJ_MatGetSubmatrices(Mat C) 1246 { 1247 PetscErrorCode ierr; 1248 Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 1249 Mat_SubMat *submatj = c->submatis1; 1250 PetscInt i; 1251 1252 PetscFunctionBegin; 1253 ierr = submatj->destroy(C);CHKERRQ(ierr); 1254 1255 ierr = PetscFree4(submatj->sbuf1,submatj->ptr,submatj->tmp,submatj->ctr);CHKERRQ(ierr); 1256 1257 for (i=0; i<submatj->nrqr; ++i) { 1258 ierr = PetscFree(submatj->sbuf2[i]);CHKERRQ(ierr); 1259 } 1260 ierr = PetscFree3(submatj->sbuf2,submatj->req_size,submatj->req_source1);CHKERRQ(ierr); 1261 1262 ierr = PetscFree(submatj->rbuf1[0]);CHKERRQ(ierr); 1263 ierr = PetscFree(submatj->rbuf1);CHKERRQ(ierr); 1264 1265 for (i=0; i<submatj->nrqs; ++i) { 1266 ierr = PetscFree(submatj->rbuf3[i]);CHKERRQ(ierr); 1267 } 1268 ierr = PetscFree3(submatj->req_source2,submatj->rbuf2,submatj->rbuf3);CHKERRQ(ierr); 1269 ierr = PetscFree(submatj->pa);CHKERRQ(ierr); 1270 1271 ierr = PetscFree(submatj->row2proc);CHKERRQ(ierr); 1272 1273 #if defined(PETSC_USE_CTABLE) 1274 ierr = PetscTableDestroy((PetscTable*)&submatj->rmap);CHKERRQ(ierr); 1275 if (submatj->cmap_loc) {ierr = PetscFree(submatj->cmap_loc);CHKERRQ(ierr);} 1276 ierr = PetscFree(submatj->rmap_loc);CHKERRQ(ierr); 1277 #else 1278 ierr = 
PetscFree(submatj->rmap);CHKERRQ(ierr); 1279 #endif 1280 1281 if (!submatj->allcolumns) { 1282 #if defined(PETSC_USE_CTABLE) 1283 ierr = PetscTableDestroy((PetscTable*)&submatj->cmap);CHKERRQ(ierr); 1284 #else 1285 ierr = PetscFree(submatj->cmap);CHKERRQ(ierr); 1286 #endif 1287 } 1288 1289 ierr = PetscFree(submatj);CHKERRQ(ierr); 1290 PetscFunctionReturn(0); 1291 } 1292 1293 #undef __FUNCT__ 1294 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS_Local" 1295 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1296 { 1297 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1298 Mat submat,A = c->A,B = c->B; 1299 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1300 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1301 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1302 const PetscInt *icol,*irow; 1303 PetscInt nrow,ncol,start; 1304 PetscErrorCode ierr; 1305 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1306 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1307 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1308 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1309 PetscInt *lens,rmax,ncols,*cols,Crow; 1310 #if defined(PETSC_USE_CTABLE) 1311 PetscTable cmap,rmap; 1312 PetscInt *cmap_loc,*rmap_loc; 1313 #else 1314 PetscInt *cmap,*rmap; 1315 #endif 1316 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1317 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1318 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL; 1319 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1320 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1321 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1322 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1323 MPI_Comm comm; 1324 PetscScalar 
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1325 PetscMPIInt *onodes1,*olengths1,idex,end; 1326 Mat_SubMat *smatis1; 1327 PetscBool isrowsorted; 1328 1329 PetscFunctionBegin; 1330 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1331 1332 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1333 size = c->size; 1334 rank = c->rank; 1335 1336 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1337 if (!isrowsorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow[0] must be sorted"); 1338 1339 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1340 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1341 if (allcolumns) { 1342 icol = NULL; 1343 ncol = C->cmap->N; 1344 } else { 1345 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1346 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1347 } 1348 1349 if (scall == MAT_INITIAL_MATRIX) { 1350 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1351 1352 /* Get some new tags to keep the communication clean */ 1353 tag1 = ((PetscObject)C)->tag; 1354 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1355 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1356 1357 /* evaluate communication - mesg to who, length of mesg, and buffer space 1358 required. Based on this, buffers are allocated, and data copied into them */ 1359 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1360 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1361 1362 /* w1[proc] = num of rows owned by proc -- to be requested */ 1363 proc = 0; 1364 nrqs = 0; /* num of outgoing messages */ 1365 for (j=0; j<nrow; j++) { 1366 row = irow[j]; /* sorted! 
*/ 1367 while (row >= C->rmap->range[proc+1]) proc++; 1368 w1[proc]++; 1369 row2proc[j] = proc; /* map row index to proc */ 1370 1371 if (proc != rank && !w2[proc]) { 1372 w2[proc] = 1; nrqs++; 1373 } 1374 } 1375 w1[rank] = 0; /* rows owned by self will not be requested */ 1376 1377 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1378 for (proc=0,j=0; proc<size; proc++) { 1379 if (w1[proc]) { pa[j++] = proc;} 1380 } 1381 1382 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1383 msz = 0; /* total mesg length (for all procs) */ 1384 for (i=0; i<nrqs; i++) { 1385 proc = pa[i]; 1386 w1[proc] += 3; 1387 msz += w1[proc]; 1388 } 1389 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1390 1391 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1392 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1393 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1394 1395 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1396 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1397 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1398 1399 /* Now post the Irecvs corresponding to these messages */ 1400 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1401 1402 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1403 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1404 1405 /* Allocate Memory for outgoing messages */ 1406 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1407 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 1408 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 1409 1410 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1411 iptr = tmp; 1412 for (i=0; i<nrqs; i++) { 1413 proc = pa[i]; 1414 sbuf1[proc] = iptr; 
1415 iptr += w1[proc]; 1416 } 1417 1418 /* Form the outgoing messages */ 1419 /* Initialize the header space */ 1420 for (i=0; i<nrqs; i++) { 1421 proc = pa[i]; 1422 ierr = PetscMemzero(sbuf1[proc],3*sizeof(PetscInt));CHKERRQ(ierr); 1423 ptr[proc] = sbuf1[proc] + 3; 1424 } 1425 1426 /* Parse the isrow and copy data into outbuf */ 1427 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 1428 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1429 proc = row2proc[j]; 1430 if (proc != rank) { /* copy to the outgoing buf*/ 1431 *ptr[proc] = irow[j]; 1432 ctr[proc]++; ptr[proc]++; 1433 } 1434 } 1435 1436 /* Update the headers for the current IS */ 1437 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1438 if ((ctr_j = ctr[j])) { 1439 sbuf1_j = sbuf1[j]; 1440 k = ++sbuf1_j[0]; 1441 sbuf1_j[2*k] = ctr_j; 1442 sbuf1_j[2*k-1] = 0; 1443 } 1444 } 1445 1446 /* Now post the sends */ 1447 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1448 for (i=0; i<nrqs; ++i) { 1449 proc = pa[i]; 1450 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1451 } 1452 1453 /* Post Receives to capture the buffer size */ 1454 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1455 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1456 1457 rbuf2[0] = tmp + msz; 1458 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1459 1460 for (i=0; i<nrqs; ++i) { 1461 proc = pa[i]; 1462 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1463 } 1464 1465 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1466 1467 /* Send to other procs the buf size they should allocate */ 1468 /* Receive messages*/ 1469 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1470 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1471 1472 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1473 for 
(i=0; i<nrqr; ++i) { 1474 req_size[i] = 0; 1475 rbuf1_i = rbuf1[i]; 1476 start = 2*rbuf1_i[0] + 1; 1477 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1478 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1479 sbuf2_i = sbuf2[i]; 1480 for (j=start; j<end; j++) { 1481 k = rbuf1_i[j] - rstart; 1482 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1483 sbuf2_i[j] = ncols; 1484 req_size[i] += ncols; 1485 } 1486 req_source1[i] = r_status1[i].MPI_SOURCE; 1487 1488 /* form the header */ 1489 sbuf2_i[0] = req_size[i]; 1490 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1491 1492 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1493 } 1494 1495 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1496 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1497 1498 /* rbuf2 is received, Post recv column indices a->j */ 1499 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1500 1501 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1502 for (i=0; i<nrqs; ++i) { 1503 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1504 req_source2[i] = r_status2[i].MPI_SOURCE; 1505 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1506 } 1507 1508 /* Wait on sends1 and sends2 */ 1509 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1510 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1511 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1512 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1513 1514 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1515 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1516 1517 /* Now allocate sending buffers for a->j, and send them off */ 1518 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1519 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1520 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1521 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + 
req_size[i-1]; 1522 1523 for (i=0; i<nrqr; i++) { /* for each requested message */ 1524 rbuf1_i = rbuf1[i]; 1525 sbuf_aj_i = sbuf_aj[i]; 1526 ct1 = 2*rbuf1_i[0] + 1; 1527 ct2 = 0; 1528 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1529 1530 kmax = rbuf1[i][2]; 1531 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1532 row = rbuf1_i[ct1] - rstart; 1533 nzA = ai[row+1] - ai[row]; 1534 nzB = bi[row+1] - bi[row]; 1535 ncols = nzA + nzB; 1536 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1537 1538 /* load the column indices for this row into cols*/ 1539 cols = sbuf_aj_i + ct2; 1540 1541 lwrite = 0; 1542 for (l=0; l<nzB; l++) { 1543 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1544 } 1545 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1546 for (l=0; l<nzB; l++) { 1547 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1548 } 1549 1550 ct2 += ncols; 1551 } 1552 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1553 } 1554 1555 /* create column map (cmap): global col of C -> local col of submat */ 1556 #if defined(PETSC_USE_CTABLE) 1557 if (!allcolumns) { 1558 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1559 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1560 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1561 if (icol[j] >= cstart && icol[j] <cend) { 1562 cmap_loc[icol[j] - cstart] = j+1; 1563 } else { /* use PetscTable for non-local col indices */ 1564 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1565 } 1566 } 1567 } else { 1568 cmap = NULL; 1569 cmap_loc = NULL; 1570 } 1571 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1572 #else 1573 if (!allcolumns) { 1574 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1575 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1576 } else { 1577 cmap = NULL; 1578 } 1579 #endif 1580 1581 /* Create lens for 
MatSeqAIJSetPreallocation() */ 1582 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1583 1584 /* Compute lens from local part of C */ 1585 for (j=0; j<nrow; j++) { 1586 row = irow[j]; 1587 proc = row2proc[j]; 1588 if (proc == rank) { 1589 /* diagonal part A = c->A */ 1590 ncols = ai[row-rstart+1] - ai[row-rstart]; 1591 cols = aj + ai[row-rstart]; 1592 if (!allcolumns) { 1593 for (k=0; k<ncols; k++) { 1594 #if defined(PETSC_USE_CTABLE) 1595 tcol = cmap_loc[cols[k]]; 1596 #else 1597 tcol = cmap[cols[k]+cstart]; 1598 #endif 1599 if (tcol) lens[j]++; 1600 } 1601 } else { /* allcolumns */ 1602 lens[j] = ncols; 1603 } 1604 1605 /* off-diagonal part B = c->B */ 1606 ncols = bi[row-rstart+1] - bi[row-rstart]; 1607 cols = bj + bi[row-rstart]; 1608 if (!allcolumns) { 1609 for (k=0; k<ncols; k++) { 1610 #if defined(PETSC_USE_CTABLE) 1611 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1612 #else 1613 tcol = cmap[bmap[cols[k]]]; 1614 #endif 1615 if (tcol) lens[j]++; 1616 } 1617 } else { /* allcolumns */ 1618 lens[j] += ncols; 1619 } 1620 } 1621 } 1622 1623 /* Create row map (rmap): global row of C -> local row of submat */ 1624 #if defined(PETSC_USE_CTABLE) 1625 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1626 for (j=0; j<nrow; j++) { 1627 row = irow[j]; 1628 proc = row2proc[j]; 1629 if (proc == rank) { /* a local row */ 1630 rmap_loc[row - rstart] = j; 1631 } else { 1632 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1633 } 1634 } 1635 #else 1636 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1637 for (j=0; j<nrow; j++) { 1638 rmap[irow[j]] = j; 1639 } 1640 #endif 1641 1642 /* Update lens from offproc data */ 1643 /* recv a->j is done */ 1644 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1645 for (i=0; i<nrqs; i++) { 1646 proc = pa[i]; 1647 sbuf1_i = sbuf1[proc]; 1648 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1649 ct1 = 2 + 1; 1650 ct2 = 0; 1651 
rbuf2_i = rbuf2[i]; /* received length of C->j */ 1652 rbuf3_i = rbuf3[i]; /* received C->j */ 1653 1654 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1655 max1 = sbuf1_i[2]; 1656 for (k=0; k<max1; k++,ct1++) { 1657 #if defined(PETSC_USE_CTABLE) 1658 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1659 row--; 1660 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1661 #else 1662 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1663 #endif 1664 /* Now, store row index of submat in sbuf1_i[ct1] */ 1665 sbuf1_i[ct1] = row; 1666 1667 nnz = rbuf2_i[ct1]; 1668 if (!allcolumns) { 1669 for (l=0; l<nnz; l++,ct2++) { 1670 #if defined(PETSC_USE_CTABLE) 1671 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1672 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1673 } else { 1674 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1675 } 1676 #else 1677 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1678 #endif 1679 if (tcol) lens[row]++; 1680 } 1681 } else { /* allcolumns */ 1682 lens[row] += nnz; 1683 } 1684 } 1685 } 1686 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1687 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1688 1689 /* Create the submatrices */ 1690 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1691 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1692 1693 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1694 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1695 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1696 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1697 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1698 1699 /* create struct Mat_SubMat and attached it to submat */ 1700 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1701 subc = (Mat_SeqAIJ*)submat->data; 1702 subc->submatis1 = smatis1; 1703 1704 smatis1->nrqs = nrqs; 
1705 smatis1->nrqr = nrqr; 1706 smatis1->rbuf1 = rbuf1; 1707 smatis1->rbuf2 = rbuf2; 1708 smatis1->rbuf3 = rbuf3; 1709 smatis1->sbuf2 = sbuf2; 1710 smatis1->req_source2 = req_source2; 1711 1712 smatis1->sbuf1 = sbuf1; 1713 smatis1->ptr = ptr; 1714 smatis1->tmp = tmp; 1715 smatis1->ctr = ctr; 1716 1717 smatis1->pa = pa; 1718 smatis1->req_size = req_size; 1719 smatis1->req_source1 = req_source1; 1720 1721 smatis1->allcolumns = allcolumns; 1722 smatis1->row2proc = row2proc; 1723 smatis1->rmap = rmap; 1724 smatis1->cmap = cmap; 1725 #if defined(PETSC_USE_CTABLE) 1726 smatis1->rmap_loc = rmap_loc; 1727 smatis1->cmap_loc = cmap_loc; 1728 #endif 1729 1730 smatis1->destroy = submat->ops->destroy; 1731 submat->ops->destroy = MatDestroy_MPIAIJ_MatGetSubmatrices; 1732 submat->factortype = C->factortype; 1733 1734 /* compute rmax */ 1735 rmax = 0; 1736 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1737 1738 } else { /* scall == MAT_REUSE_MATRIX */ 1739 submat = submats[0]; 1740 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1741 1742 subc = (Mat_SeqAIJ*)submat->data; 1743 rmax = subc->rmax; 1744 smatis1 = subc->submatis1; 1745 nrqs = smatis1->nrqs; 1746 nrqr = smatis1->nrqr; 1747 rbuf1 = smatis1->rbuf1; 1748 rbuf2 = smatis1->rbuf2; 1749 rbuf3 = smatis1->rbuf3; 1750 req_source2 = smatis1->req_source2; 1751 1752 sbuf1 = smatis1->sbuf1; 1753 sbuf2 = smatis1->sbuf2; 1754 ptr = smatis1->ptr; 1755 tmp = smatis1->tmp; 1756 ctr = smatis1->ctr; 1757 1758 pa = smatis1->pa; 1759 req_size = smatis1->req_size; 1760 req_source1 = smatis1->req_source1; 1761 1762 allcolumns = smatis1->allcolumns; 1763 row2proc = smatis1->row2proc; 1764 rmap = smatis1->rmap; 1765 cmap = smatis1->cmap; 1766 #if defined(PETSC_USE_CTABLE) 1767 rmap_loc = smatis1->rmap_loc; 1768 cmap_loc = smatis1->cmap_loc; 1769 #endif 1770 } 1771 1772 /* Post recv matrix values */ 1773 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1774 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1775 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1776 for (i=0; i<nrqs; ++i) { 1777 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1778 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1779 } 1780 1781 /* Allocate sending buffers for a->a, and send them off */ 1782 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1783 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1784 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1785 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1786 1787 for (i=0; i<nrqr; i++) { 1788 rbuf1_i = rbuf1[i]; 1789 sbuf_aa_i = sbuf_aa[i]; 1790 ct1 = 2*rbuf1_i[0]+1; 1791 ct2 = 0; 1792 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1793 1794 kmax = rbuf1_i[2]; 1795 for (k=0; k<kmax; k++,ct1++) { 1796 row = rbuf1_i[ct1] - rstart; 1797 nzA = ai[row+1] - ai[row]; 1798 nzB = bi[row+1] - 
bi[row]; 1799 ncols = nzA + nzB; 1800 cworkB = bj + bi[row]; 1801 vworkA = a_a + ai[row]; 1802 vworkB = b_a + bi[row]; 1803 1804 /* load the column values for this row into vals*/ 1805 vals = sbuf_aa_i + ct2; 1806 1807 lwrite = 0; 1808 for (l=0; l<nzB; l++) { 1809 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1810 } 1811 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1812 for (l=0; l<nzB; l++) { 1813 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1814 } 1815 1816 ct2 += ncols; 1817 } 1818 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1819 } 1820 1821 /* Assemble submat */ 1822 /* First assemble the local rows */ 1823 for (j=0; j<nrow; j++) { 1824 row = irow[j]; 1825 proc = row2proc[j]; 1826 if (proc == rank) { 1827 Crow = row - rstart; /* local row index of C */ 1828 #if defined(PETSC_USE_CTABLE) 1829 row = rmap_loc[Crow]; /* row index of submat */ 1830 #else 1831 row = rmap[row]; 1832 #endif 1833 1834 if (allcolumns) { 1835 /* diagonal part A = c->A */ 1836 ncols = ai[Crow+1] - ai[Crow]; 1837 cols = aj + ai[Crow]; 1838 vals = a->a + ai[Crow]; 1839 i = 0; 1840 for (k=0; k<ncols; k++) { 1841 subcols[i] = cols[k] + cstart; 1842 subvals[i++] = vals[k]; 1843 } 1844 1845 /* off-diagonal part B = c->B */ 1846 ncols = bi[Crow+1] - bi[Crow]; 1847 cols = bj + bi[Crow]; 1848 vals = b->a + bi[Crow]; 1849 for (k=0; k<ncols; k++) { 1850 subcols[i] = bmap[cols[k]]; 1851 subvals[i++] = vals[k]; 1852 } 1853 1854 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1855 1856 } else { /* !allcolumns */ 1857 #if defined(PETSC_USE_CTABLE) 1858 /* diagonal part A = c->A */ 1859 ncols = ai[Crow+1] - ai[Crow]; 1860 cols = aj + ai[Crow]; 1861 vals = a->a + ai[Crow]; 1862 i = 0; 1863 for (k=0; k<ncols; k++) { 1864 tcol = cmap_loc[cols[k]]; 1865 if (tcol) { 1866 subcols[i] = --tcol; 1867 subvals[i++] = vals[k]; 1868 } 1869 } 1870 1871 /* off-diagonal part B = c->B */ 
1872 ncols = bi[Crow+1] - bi[Crow]; 1873 cols = bj + bi[Crow]; 1874 vals = b->a + bi[Crow]; 1875 for (k=0; k<ncols; k++) { 1876 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1877 if (tcol) { 1878 subcols[i] = --tcol; 1879 subvals[i++] = vals[k]; 1880 } 1881 } 1882 #else 1883 /* diagonal part A = c->A */ 1884 ncols = ai[Crow+1] - ai[Crow]; 1885 cols = aj + ai[Crow]; 1886 vals = a->a + ai[Crow]; 1887 i = 0; 1888 for (k=0; k<ncols; k++) { 1889 tcol = cmap[cols[k]+cstart]; 1890 if (tcol) { 1891 subcols[i] = --tcol; 1892 subvals[i++] = vals[k]; 1893 } 1894 } 1895 1896 /* off-diagonal part B = c->B */ 1897 ncols = bi[Crow+1] - bi[Crow]; 1898 cols = bj + bi[Crow]; 1899 vals = b->a + bi[Crow]; 1900 for (k=0; k<ncols; k++) { 1901 tcol = cmap[bmap[cols[k]]]; 1902 if (tcol) { 1903 subcols[i] = --tcol; 1904 subvals[i++] = vals[k]; 1905 } 1906 } 1907 #endif 1908 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1909 } 1910 } 1911 } 1912 1913 /* Now assemble the off-proc rows */ 1914 for (i=0; i<nrqs; i++) { /* for each requested message */ 1915 /* recv values from other processes */ 1916 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1917 proc = pa[idex]; 1918 sbuf1_i = sbuf1[proc]; 1919 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1920 ct1 = 2 + 1; 1921 ct2 = 0; /* count of received C->j */ 1922 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1923 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1924 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1925 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1926 1927 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1928 max1 = sbuf1_i[2]; /* num of rows */ 1929 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1930 row = sbuf1_i[ct1]; /* row index of submat */ 
1931 if (!allcolumns) { 1932 idex = 0; 1933 if (scall == MAT_INITIAL_MATRIX) { 1934 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1935 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1936 #if defined(PETSC_USE_CTABLE) 1937 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1938 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1939 } else { 1940 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1941 } 1942 #else 1943 tcol = cmap[rbuf3_i[ct2]]; 1944 #endif 1945 if (tcol) { 1946 subcols[idex] = --tcol; 1947 subvals[idex++] = rbuf4_i[ct2]; 1948 1949 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1950 For reuse, we replace received C->j with index that should be inserted to submat */ 1951 rbuf3_i[ct3++] = ct2; 1952 } 1953 } 1954 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1955 1956 } else { /* scall == MAT_REUSE_MATRIX */ 1957 submat = submats[0]; 1958 subc = (Mat_SeqAIJ*)submat->data; 1959 1960 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1961 for (l=0; l<nnz; l++) { 1962 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1963 subvals[idex++] = rbuf4_i[ct2]; 1964 } 1965 1966 bj = subc->j + subc->i[row]; 1967 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1968 } 1969 } else { /* allcolumns */ 1970 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1971 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1972 ct2 += nnz; 1973 } 1974 } 1975 } 1976 1977 /* sending a->a are done */ 1978 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1979 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1980 1981 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1982 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1983 submats[0] = submat; 1984 1985 /* 
Restore the indices */ 1986 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1987 if (!allcolumns) { 1988 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1989 } 1990 1991 /* Destroy allocated memory */ 1992 for (i=0; i<nrqs; ++i) { 1993 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1994 } 1995 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1996 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1997 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1998 1999 if (scall == MAT_INITIAL_MATRIX) { 2000 ierr = PetscFree(lens);CHKERRQ(ierr); 2001 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2002 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2003 } 2004 PetscFunctionReturn(0); 2005 } 2006 2007 #undef __FUNCT__ 2008 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_SingleIS" 2009 PetscErrorCode MatGetSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2010 { 2011 PetscErrorCode ierr; 2012 PetscInt ncol; 2013 PetscBool colflag,allcolumns=PETSC_FALSE; 2014 2015 PetscFunctionBegin; 2016 /* Allocate memory to hold all the submatrices */ 2017 if (scall != MAT_REUSE_MATRIX) { 2018 ierr = PetscMalloc1(1,submat);CHKERRQ(ierr); 2019 } 2020 2021 /* Check for special case: each processor gets entire matrix columns */ 2022 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 2023 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 2024 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 2025 2026 ierr = MatGetSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 2027 PetscFunctionReturn(0); 2028 } 2029 2030 #undef __FUNCT__ 2031 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 2032 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 2033 { 2034 PetscErrorCode ierr; 2035 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 2036 PetscBool 
rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 2037 2038 PetscFunctionBegin; 2039 /* Check for special case: each processor gets entire matrix */ 2040 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip several MPIU_Allreduce() */ 2041 ierr = MatGetSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2042 PetscFunctionReturn(0); 2043 } 2044 2045 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2046 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2047 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2048 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2049 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2050 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2051 wantallmatrix = PETSC_TRUE; 2052 2053 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2054 } 2055 } 2056 ierr = MPIU_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2057 if (twantallmatrix) { 2058 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2059 PetscFunctionReturn(0); 2060 } 2061 2062 /* Allocate memory to hold all the submatrices */ 2063 if (scall != MAT_REUSE_MATRIX) { 2064 ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr); 2065 } 2066 2067 /* Check for special case: each processor gets entire matrix columns */ 2068 ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr); 2069 for (i=0; i<ismax; i++) { 2070 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2071 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 2072 if (colflag && ncol == C->cmap->N) { 2073 allcolumns[i] = PETSC_TRUE; 2074 } else { 2075 allcolumns[i] = PETSC_FALSE; 2076 } 2077 } 2078 2079 /* Determine the number of stages through which submatrices are done */ 2080 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 2081 2082 /* 2083 Each stage will extract nmax 
submatrices. 2084 nmax is determined by the matrix column dimension. 2085 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2086 */ 2087 if (!nmax) nmax = 1; 2088 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 2089 2090 /* Make sure every processor loops through the nstages */ 2091 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2092 2093 for (i=0,pos=0; i<nstages; i++) { 2094 if (pos+nmax <= ismax) max_no = nmax; 2095 else if (pos == ismax) max_no = 0; 2096 else max_no = ismax-pos; 2097 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 2098 pos += max_no; 2099 } 2100 2101 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 2102 PetscFunctionReturn(0); 2103 } 2104 2105 /* -------------------------------------------------------------------------*/ 2106 #undef __FUNCT__ 2107 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 2108 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 2109 { 2110 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2111 Mat A = c->A; 2112 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 2113 const PetscInt **icol,**irow; 2114 PetscInt *nrow,*ncol,start; 2115 PetscErrorCode ierr; 2116 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 2117 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 2118 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 2119 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 2120 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2121 #if defined(PETSC_USE_CTABLE) 2122 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2123 #else 2124 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2125 #endif 2126 const PetscInt *irow_i; 2127 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2128 MPI_Request 
                 *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1;
  PetscMPIInt    idex,idex2,end;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  tag0 = ((PetscObject)C)->tag;
  size = c->size;
  rank = c->rank;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

  ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr);

  /* Pull out the index arrays for every requested IS; for an "all columns"
     request icol stays NULL and ncol is the global column count */
  for (i=0; i<ismax; i++) {
    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
    if (allcolumns[i]) {
      icol[i] = NULL;
      ncol[i] = C->cmap->N;
    } else {
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
      ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector */
    jmax   = nrow[i];
    irow_i = irow[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      /* linear scan of the ownership ranges to find the owning rank */
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;             /* no of outgoing messages */
  msz      = 0;             /* total mesg length (for all procs) */
  w1[rank] = 0;             /* no mesg sent to self */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* (proc -array) */
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }
  ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

  ierr = PetscFree(onodes1);CHKERRQ(ierr);
  ierr = PetscFree(olengths1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* Carve the single 'tmp' slab into one outgoing buffer per destination */
    PetscInt *iptr = tmp;
    k = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += k;
      sbuf1[j] = iptr;
      k        = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for (i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1; /* data region starts past the header */
  }

  /* Parse the isrow and copy data into outbuf */
  for (i=0; i<ismax; i++) {
    ierr   = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];
        sbuf1_j[2*k]   = ctr_j;  /* number of rows requested for IS i */
        sbuf1_j[2*k-1] = i;      /* which IS these rows belong to */
      }
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size */
  ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr);
  /* NOTE: rbuf2 aliases the upper half of the 'tmp' slab allocated above */
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
  ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr);
  {
    PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
    PetscInt *sbuf2_i;

    for (i=0; i<nrqr; ++i) {
      ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);

      req_size[idex] = 0;
      rbuf1_i        = rbuf1[idex];
      start          = 2*rbuf1_i[0] + 1; /* skip the header */
      ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
      ierr           = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr);
      sbuf2_i        = sbuf2[idex];
      /* reply with the nonzero count (A part + B part) of every requested row */
      for (j=start; j<end; j++) {
        id              = rbuf1_i[j] - rstart;
        ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
        sbuf2_i[j]      = ncols;
        req_size[idex] += ncols;
      }
      req_source[idex] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[idex];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(r_status1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);

  /* recv buffer sizes */
  /* Receive messages*/

  ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

  /* as each size reply arrives, post the recvs for column indices and values */
  for (i=0; i<nrqs; ++i) {
    ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr);
    ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
  }
  ierr = PetscFree(r_status2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);

  /* Wait on sends1 and sends2 */
  ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
  ierr = PetscFree(s_status1);CHKERRQ(ierr);
  ierr = PetscFree(s_status2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);

  /* Now allocate buffers for a->j, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
  {
    PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
    PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt cend = C->cmap->rend;
    PetscInt *a_j = a->j,*b_j = b->j,ctmp;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1[i][2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols
             (global numbering, in ascending order: B-left, A, B-right) */
          cols = sbuf_aj_i + ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
          }
          for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
          for (l=0; l<nzB; l++) {
            if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr);

  /* Allocate buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals
             (same B-left / A / B-right order as the index message above) */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf1);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);

  /* Form the matrix */
  /* create col map: global col of C -> local col of submatrices */
  {
    const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          /* keys and values are stored shifted by +1 so that 0 can mean "absent" */
          ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      } else {
        cmap[i] = NULL;
      }
    }
#else
    ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr);
    for (i=0; i<ismax; i++) {
      if (!allcolumns[i]) {
        ierr   = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
        ierr   = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr);
        jmax   = ncol[i];
        icol_i = icol[i];
        cmap_i = cmap[i];
        for (j=0; j<jmax; j++) {
          cmap_i[icol_i[j]] = j+1; /* +1 shift: 0 means "column not selected" */
        }
      } else {
        cmap[i] = NULL;
      }
    }
#endif
  }

  /* Create lens which is required for MatCreate... */
  for (i=0,j=0; i<ismax; i++) j += nrow[i];
  ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

  /* Update lens from local data */
  for (i=0; i<ismax; i++) {
    jmax = nrow[i];
    if (!allcolumns[i]) cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for (j=0; j<jmax; j++) {
      l   = 0;
      row = irow_i[j];
      while (row >= C->rmap->range[l+1]) l++;
      proc = l;
      if (proc == rank) {
        ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) lens_i[j]++;
          }
        } else { /* allcolumns */
          lens_i[j] = ncols;
        }
        ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
    }
  }
#else
  ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr);
  if (ismax) {
    ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr);
    ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
  }
  for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N;
  for (i=0; i<ismax; i++) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      rmap_i[irow_i[j]] = j;
    }
  }
#endif

  /* Update lens from offproc data */
  {
    PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
          row--;
          if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
          max2 = rbuf2_i[ct1];
          for (l=0; l<max2; l++,ct2++) {
            if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) lens_i[row]++;
            } else { /* allcolumns */
              lens_i[row]++; /* lens_i[row] += max2 ? */
            }
          }
        }
      }
    }
  }
  ierr = PetscFree(r_status3);CHKERRQ(ierr);
  ierr = PetscFree(r_waits3);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
  ierr = PetscFree(s_status3);CHKERRQ(ierr);
  ierr = PetscFree(s_waits3);CHKERRQ(ierr);

  /* Create the submatrices */
  if (scall == MAT_REUSE_MATRIX) {
    PetscBool flag;

    /*
       Assumes new rows are same length as the old rows, hence bug!
    */
    for (i=0; i<ismax; i++) {
      mat = (Mat_SeqAIJ*)(submats[i]->data);
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
      ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr);
      if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
      /* Initial matrix as if empty */
      ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr);

      submats[i]->factortype = C->factortype;
    }
  } else {
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;
      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    PetscInt    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;

      if (!allcolumns[i]) cmap_i = cmap[i];
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        l   = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[l+1]) l++;
        proc = l;
        if (proc == rank) {
          old_row = row;
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen_row = imat_ilen[row];
          ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i    = imat_i[row];
          mat_a    = imat_a + mat_i;
          mat_j    = imat_j + mat_i;
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = vals[k];
                ilen_row++;
              }
            }
          } else { /* allcolumns */
            for (k=0; k<ncols; k++) {
              *mat_j++ = cols[k];  /* global col index! */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows*/
  {
    PetscInt    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    PetscInt    *imat_j,*imat_i;
    PetscScalar *imat_a,*rbuf4_i;

    for (tmp2=0; tmp2<nrqs; tmp2++) {
      ierr    = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr);
      idex    = pa[idex2];
      sbuf1_i = sbuf1[idex];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[idex2];
      rbuf3_i = rbuf3[idex2];
      rbuf4_i = rbuf4[idex2];
      for (j=1; j<=jmax; j++) {
        is_no  = sbuf1_i[2*j-1];
        rmap_i = rmap[is_no];
        if (!allcolumns[is_no]) cmap_i = cmap[is_no];
        mat       = (Mat_SeqAIJ*)submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for (k=0; k<max1; k++,ct1++) {
          row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
          row--;
#else
          row = rmap_i[row];
#endif
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          if (!allcolumns[is_no]) {
            for (l=0; l<max2; l++,ct2++) {

#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[rbuf3_i[ct2]];
#endif
              if (tcol) {
                *mat_j++ = tcol - 1;
                *mat_a++ = rbuf4_i[ct2];
                ilen++;
              }
            }
          } else { /* allcolumns */
            for (l=0; l<max2; l++,ct2++) {
              *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }

  /* sort the rows */
  {
    PetscInt    *imat_ilen,*imat_j,*imat_i;
    PetscScalar *imat_a;

    for (i=0; i<ismax; i++) {
      mat       = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      imat_ilen = mat->ilen;

      /* rows in the allcolumns case were received already sorted */
      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        PetscInt ilen;

        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ilen  = imat_ilen[j];
        ierr  = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  /* Destroy allocated memory */
  ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
  ierr = PetscFree(pa);CHKERRQ(ierr);

  ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
    Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [0,N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.
2865 2866 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2867 Following this call, C->A & C->B have been created, even if empty. 2868 */ 2869 #undef __FUNCT__ 2870 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 2871 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2872 { 2873 /* If making this function public, change the error returned in this function away from _PLIB. */ 2874 PetscErrorCode ierr; 2875 Mat_MPIAIJ *aij; 2876 Mat_SeqAIJ *Baij; 2877 PetscBool seqaij,Bdisassembled; 2878 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2879 PetscScalar v; 2880 const PetscInt *rowindices,*colindices; 2881 2882 PetscFunctionBegin; 2883 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2884 if (A) { 2885 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2886 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2887 if (rowemb) { 2888 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2889 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2890 } else { 2891 if (C->rmap->n != A->rmap->n) { 2892 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2893 } 2894 } 2895 if (dcolemb) { 2896 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2897 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2898 } else { 2899 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2900 } 2901 } 2902 if (B) { 2903 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2904 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong 
type"); 2905 if (rowemb) { 2906 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2907 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2908 } else { 2909 if (C->rmap->n != B->rmap->n) { 2910 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2911 } 2912 } 2913 if (ocolemb) { 2914 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2915 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2916 } else { 2917 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2918 } 2919 } 2920 2921 aij = (Mat_MPIAIJ*)(C->data); 2922 if (!aij->A) { 2923 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2924 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2925 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2926 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2927 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2928 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2929 } 2930 if (A) { 2931 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2932 } else { 2933 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2934 } 2935 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2936 /* 2937 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2938 need to "disassemble" B -- convert it to using C's global indices. 2939 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2940 2941 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2942 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2943 2944 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2945 At least avoid calling MatSetValues() and the implied searches? 2946 */ 2947 2948 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2949 #if defined(PETSC_USE_CTABLE) 2950 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2951 #else 2952 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2953 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2954 if (aij->B) { 2955 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2956 } 2957 #endif 2958 ngcol = 0; 2959 if (aij->lvec) { 2960 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2961 } 2962 if (aij->garray) { 2963 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2964 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2965 } 2966 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2967 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2968 } 2969 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2970 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2971 } 2972 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2973 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2974 } 2975 } 2976 Bdisassembled = PETSC_FALSE; 2977 if (!aij->B) { 2978 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2979 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2980 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2981 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2982 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2983 Bdisassembled = PETSC_TRUE; 2984 } 2985 if (B) { 2986 Baij = (Mat_SeqAIJ*)(B->data); 2987 if (pattern == DIFFERENT_NONZERO_PATTERN) { 2988 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 2989 for (i=0; i<B->rmap->n; i++) { 2990 nz[i] = Baij->i[i+1] - Baij->i[i]; 2991 } 2992 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
2993 ierr = PetscFree(nz);CHKERRQ(ierr); 2994 } 2995 2996 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 2997 shift = rend-rstart; 2998 count = 0; 2999 rowindices = NULL; 3000 colindices = NULL; 3001 if (rowemb) { 3002 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3003 } 3004 if (ocolemb) { 3005 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3006 } 3007 for (i=0; i<B->rmap->n; i++) { 3008 PetscInt row; 3009 row = i; 3010 if (rowindices) row = rowindices[i]; 3011 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3012 col = Baij->j[count]; 3013 if (colindices) col = colindices[col]; 3014 if (Bdisassembled && col>=rstart) col += shift; 3015 v = Baij->a[count]; 3016 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3017 ++count; 3018 } 3019 } 3020 /* No assembly for aij->B is necessary. */ 3021 /* FIXME: set aij->B's nonzerostate correctly. */ 3022 } else { 3023 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3024 } 3025 C->preallocated = PETSC_TRUE; 3026 C->was_assembled = PETSC_FALSE; 3027 C->assembled = PETSC_FALSE; 3028 /* 3029 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3030 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3031 */ 3032 PetscFunctionReturn(0); 3033 } 3034 3035 #undef __FUNCT__ 3036 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 3037 /* 3038 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3039 */ 3040 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3041 { 3042 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 3043 3044 PetscFunctionBegin; 3045 PetscValidPointer(A,2); 3046 PetscValidPointer(B,3); 3047 /* FIXME: make sure C is assembled */ 3048 *A = aij->A; 3049 *B = aij->B; 3050 /* Note that we don't incref *A and *B, so be careful! 
*/ 3051 PetscFunctionReturn(0); 3052 } 3053 3054 /* 3055 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3056 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3057 */ 3058 #undef __FUNCT__ 3059 #define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ" 3060 PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3061 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3062 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3063 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3064 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3065 { 3066 PetscErrorCode ierr; 3067 PetscMPIInt isize,flag; 3068 PetscInt i,ii,cismax,ispar,ciscol_localsize; 3069 Mat *A,*B; 3070 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3071 3072 PetscFunctionBegin; 3073 if (!ismax) PetscFunctionReturn(0); 3074 3075 for (i = 0, cismax = 0; i < ismax; ++i) { 3076 PetscMPIInt isize; 3077 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3078 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3079 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3080 if (isize > 1) ++cismax; 3081 } 3082 /* 3083 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3084 ispar counts the number of parallel ISs across C's comm. 3085 */ 3086 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3087 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. 
*/ 3088 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3089 PetscFunctionReturn(0); 3090 } 3091 3092 /* if (ispar) */ 3093 /* 3094 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3095 These are used to extract the off-diag portion of the resulting parallel matrix. 3096 The row IS for the off-diag portion is the same as for the diag portion, 3097 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3098 */ 3099 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3100 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3101 for (i = 0, ii = 0; i < ismax; ++i) { 3102 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3103 if (isize > 1) { 3104 /* 3105 TODO: This is the part that's ***NOT SCALABLE***. 3106 To fix this we need to extract just the indices of C's nonzero columns 3107 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3108 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3109 to be done without serializing on the IS list, so, most likely, it is best 3110 done by rewriting MatGetSubMatrices_MPIAIJ() directly. 3111 */ 3112 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3113 /* Now we have to 3114 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3115 were sorted on each rank, concatenated they might no longer be sorted; 3116 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3117 indices in the nondecreasing order to the original index positions. 3118 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 
3119 */ 3120 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3121 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3122 ++ii; 3123 } 3124 } 3125 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3126 for (i = 0, ii = 0; i < ismax; ++i) { 3127 PetscInt j,issize; 3128 const PetscInt *indices; 3129 3130 /* 3131 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3132 */ 3133 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3134 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3135 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3136 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3137 for (j = 1; j < issize; ++j) { 3138 if (indices[j] == indices[j-1]) { 3139 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3140 } 3141 } 3142 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3143 3144 3145 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3146 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3147 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3148 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3149 for (j = 1; j < issize; ++j) { 3150 if (indices[j-1] == indices[j]) { 3151 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3152 } 3153 } 3154 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3155 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3156 if (isize > 1) { 3157 cisrow[ii] = isrow[i]; 3158 ++ii; 3159 } 3160 } 3161 /* 3162 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3163 array of sequential matrices underlying the resulting parallel matrices. 3164 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3165 contain duplicates. 
3166 3167 There are as many diag matrices as there are original index sets. There are only as many parallel 3168 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3169 3170 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3171 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3172 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3173 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3174 with A[i] and B[ii] extracted from the corresponding MPI submat. 3175 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3176 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3177 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3178 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3179 values into A[i] and B[ii] sitting inside the corresponding submat. 3180 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3181 A[i], B[ii], AA[i] or BB[ii] matrices. 3182 */ 3183 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3184 if (scall == MAT_INITIAL_MATRIX) { 3185 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3186 /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */ 3187 } else { 3188 ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr); 3189 ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr); 3190 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. 
*/ 3191 for (i = 0, ii = 0; i < ismax; ++i) { 3192 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3193 if (isize > 1) { 3194 Mat AA,BB; 3195 ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr); 3196 if (!isrow_p[i] && !iscol_p[i]) { 3197 A[i] = AA; 3198 } else { 3199 /* TODO: extract A[i] composed on AA. */ 3200 ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3201 } 3202 if (!isrow_p[i] && !ciscol_p[ii]) { 3203 B[ii] = BB; 3204 } else { 3205 /* TODO: extract B[ii] composed on BB. */ 3206 ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr); 3207 } 3208 ++ii; 3209 } else { 3210 if (!isrow_p[i] && !iscol_p[i]) { 3211 A[i] = (*submat)[i]; 3212 } else { 3213 /* TODO: extract A[i] composed on (*submat)[i]. */ 3214 ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr); 3215 } 3216 } 3217 } 3218 } 3219 /* Now obtain the sequential A and B submatrices separately. */ 3220 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr); 3221 /* I did not figure out a good way to fix it right now. 3222 * Local column size of B[i] is different from the size of ciscol[i]. 3223 * B[i]'s size is finally determined by MatAssembly, while 3224 * ciscol[i] is computed as the complement of iscol[i]. 3225 * It is better to keep only nonzero indices when computing 3226 * the complement ciscol[i]. 3227 * */ 3228 if(scall==MAT_REUSE_MATRIX){ 3229 for(i=0; i<cismax; i++){ 3230 ierr = ISGetLocalSize(ciscol[i],&ciscol_localsize);CHKERRQ(ierr); 3231 B[i]->cmap->n = ciscol_localsize; 3232 } 3233 } 3234 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr); 3235 /* 3236 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3237 matrices A & B have been extracted directly into the parallel matrices containing them, or 3238 simply into the sequential matrix identical with the corresponding A (if isize == 1). 
3239 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3240 to have the same sparsity pattern. 3241 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3242 must be constructed for C. This is done by setseqmat(s). 3243 */ 3244 for (i = 0, ii = 0; i < ismax; ++i) { 3245 /* 3246 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3247 That way we can avoid sorting and computing permutations when reusing. 3248 To this end: 3249 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3250 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3251 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3252 */ 3253 MatStructure pattern; 3254 if (scall == MAT_INITIAL_MATRIX) { 3255 pattern = DIFFERENT_NONZERO_PATTERN; 3256 } else { 3257 pattern = SAME_NONZERO_PATTERN; 3258 } 3259 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3260 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3261 if (isize > 1) { 3262 if (scall == MAT_INITIAL_MATRIX) { 3263 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3264 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3265 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3266 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3267 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3268 } 3269 /* 3270 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 
3271 */ 3272 { 3273 Mat AA,BB; 3274 AA = NULL; 3275 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i]; 3276 BB = NULL; 3277 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii]; 3278 if (AA || BB) { 3279 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3280 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3281 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3282 } 3283 if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) { 3284 /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */ 3285 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3286 } 3287 if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) { 3288 /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[i]) && MAT_INITIAL_MATRIX */ 3289 ierr = MatDestroy(&BB);CHKERRQ(ierr); 3290 } 3291 } 3292 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3293 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3294 ++ii; 3295 } else { /* if (isize == 1) */ 3296 if (scall == MAT_INITIAL_MATRIX) { 3297 if (isrow_p[i] || iscol_p[i]) { 3298 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3299 } else (*submat)[i] = A[i]; 3300 } 3301 if (isrow_p[i] || iscol_p[i]) { 3302 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3303 /* Otherwise A is extracted straight into (*submats)[i]. */ 3304 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3305 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3306 } 3307 } 3308 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3309 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3310 } 3311 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3312 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3313 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3314 ierr = PetscFree(A);CHKERRQ(ierr); 3315 ierr = PetscFree(B);CHKERRQ(ierr); 3316 PetscFunctionReturn(0); 3317 } 3318 3319 3320 3321 #undef __FUNCT__ 3322 #define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ" 3323 PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3324 { 3325 PetscErrorCode ierr; 3326 3327 PetscFunctionBegin; 3328 ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3329 PetscFunctionReturn(0); 3330 } 3331