/*
   Routines to compute the overlapping regions of a parallel MPI matrix
   and to find the submatrices that are shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *,PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
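/*
   A minimal usage sketch (not part of this file; the matrix A, the sizes nlocal/first, and
   the index set rowis are hypothetical) of the public entry point that dispatches to the
   routines above: MatIncreaseOverlap() applies the one-level expansion ov times.

     IS rowis;                                                   // rows of a local subdomain
     ierr = ISCreateStride(PETSC_COMM_WORLD,nlocal,first,1,&rowis);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(A,1,&rowis,2);CHKERRQ(ierr);      // grow the subdomain by overlap 2
     ierr = ISDestroy(&rowis);CHKERRQ(ierr);
*/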
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  MPI_Comm       comm;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* get the row map to determine where rows should go */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve the IS data and put it all together so that we can optimize communication */
  ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test whether we need to exchange messages;
   * generally speaking, we do not need to exchange data when the overlap is 1
   */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRMPI(ierr);
  /* we do not have any messages; this usually corresponds to overlap 1 */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communication */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pairs <is id, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send the rows belonging to the remote processors so that we can get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
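  /*
     At this point each owning process has built, in sbsizes/sbdata, the reply triples
     <is id, number of columns, columns...> for the rows it was asked about.  The second
     two-sided exchange below tells the requesting processes how much reply data to expect,
     and a second PetscSF ships the replies back.  As a purely illustrative example: if this
     rank was asked about two rows of is[0] whose combined column set reduces to {2,4,5,9,11},
     the corresponding reply segment is packed as 0, 5, 2, 4, 5, 9, 11.
  */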
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OpenMPI has some bugs in its one-sided implementation */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
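/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - unpack the reply stream received above.

   recvdata is a flat run of <is id, number of columns, columns...> triples (see the packing
   in MatIncreaseOverlap_MPIAIJ_Send_Scalable below); the columns are appended to the matching
   index set, then duplicates are removed before each IS is recreated.
*/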
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx,IS is[],PetscInt nrecvs,PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id,data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp,*indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i] = lsize;
  }
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp,isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get the row id and its overlap */
  for (i=0; i<nrecvs;) {
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j<data_size; j++) {
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entries */
  for (i=0; i<nidx; i++) {
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
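/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - build the reply buffers on the owning side.

   fromrows is the flat stream of <is id, row> pairs received from the requesters
   (fromsizes[2*i] entries from the i-th requester).  For every requester and every index set,
   the requested rows are expanded into their column indices using the CSR data of the local
   diagonal (A) and off-diagonal (B) blocks, sorted and deduplicated, and packed as
   <is id, count, columns...> into sbdata, with per-requester sizes/offsets recorded in sbsizes.
*/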
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx,PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes,PetscInt *fromrows,PetscInt **sbrowsizes,PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the matrix is symmetric, we do not exploit symmetry here */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* the total number of nonzero values is used to estimate the memory usage in the next step */
  tnz = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* Is there a better way to estimate the number of nonzeros in the matrix? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  for (i=0; i<nfrom; i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++]; /* is id */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; /* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++) {
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col; /* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows += indvc_ij;
    }
  }
  /* message triple <is id, number of columns, columns> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* run the same grouping pass again, this time filling the send buffer */
  for (i=0; i<nfrom; i++) {
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2) {
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++) {
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++) {
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++) { /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  offsets[0] = 0;
  for (i=0; i<nfrom; i++) {
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
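/*
   The expansion used above (and below) follows the usual MPIAIJ storage split: for a locally
   owned row, the diagonal block contributes local column indices that must be shifted by
   cstart, while the off-diagonal block stores compressed column indices that are translated
   through the gcols/garray map.  A minimal sketch of the idiom, with ai,aj,bi,bj,gcols,cstart
   as in the routines in this file:

     for (k=ai[row]; k<ai[row+1]; k++) cols[n++] = aj[k] + cstart;  // A (diagonal) block
     for (k=bi[row]; k<bi[row+1]; k++) cols[n++] = gcols[bj[k]];    // B (off-diagonal) block
*/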
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj,*indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp;
  PetscMPIInt    rank,owner;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
  /* is this a safe way to compute the number of nonzero values? */
  tnz = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* this estimates memory better than the old implementation,
   * which used the global size of the matrix
   */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]   data(is[1])
  -----------
  mesg[n+1]
  mesg[m]    data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
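/*
   A concrete (hypothetical) instance of the format above: if a process requests rows {3,7}
   for is[1] and row {12} for is[5] from another process, the outgoing message is

      2, 1, 2, 5, 1, 3, 7, 12

   i.e. the header "2 index sets; is[1] with 2 rows; is[5] with 1 row" followed by the row data.
*/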
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to whom, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialize work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0; /* no of outgoing messages */
  msz      = 0; /* total mesg length (for all procs) */
  w1[rank] = 0; /* no message sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is the list of processors to communicate with */
  ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from the from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }
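  /*
     The local work below keeps, for each index set, a PetscBT bitmap over the global rows:
     PetscBTLookupSet() reports the previous state of the bit and sets it, so a row is appended
     to the growing result (data[] or, in the PETSC_USE_CTABLE build, the PetscTable) only the
     first time it is encountered.
  */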
  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i],M,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table,imax,&data,imax,&isz,M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax,&Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax,&M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table,imax,&data,imax,&isz,Mimax,&d_p,M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i = data[i];
#endif
      isz_i = isz[i];
      for (j=0; j<n_i; j++) { /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
  }

  /* No longer need the original indices */
  ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr,&recv_status);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);
  ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from the from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;

    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);CHKERRMPI(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k;
    }
    ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(iscomms);CHKERRQ(ierr);
  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  if (xdata) {
    ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
    ierr = PetscFree(xdata);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
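/*
   In other words, MatIncreaseOverlap_MPIAIJ_Local() performs one local expansion step: for every
   row already in the i-th set that this process owns, all column indices of that row (diagonal
   block shifted by cstart, off-diagonal block mapped through garray) are added to the set exactly
   once; rows owned by other processes are handled by the message exchange in
   MatIncreaseOverlap_MPIAIJ_Once().
*/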
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

     Input:
       C    - the matrix
       nrqr - no of messages being processed.
       rbuf - an array of pointers to the received requests

     Output:
       xdata - array of messages to be sent back
       isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a re-malloc is required we need only copy the data for that one row
   rather than all previous rows as it is now, where a single large chunk of
   memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  if (nrqr) {
    ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
    ++no_malloc;
  }
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each message from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0] = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0] = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0] = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2;
    isz1[i] = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ*)B->data;
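    /*
       Note on the copy below: each local row is written in three pieces -- the off-diagonal
       entries whose global column index lies before the diagonal block, then the diagonal
       block itself, then the remaining off-diagonal entries -- so that (for the usual square
       layout) every assembled row ends up with its global column indices in ascending order.
    */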
    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above the diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above the diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
  PetscMPIInt    *onodes1,*olengths1,idex,end;
  Mat_SubSppt    *smatis1;
  PetscBool      isrowsorted,iscolsorted;

  PetscFunctionBegin;
  PetscValidLogicalCollectiveInt(C,ismax,2);
  PetscValidLogicalCollectiveEnum(C,scall,5);
  if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
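  /*
     Communication phases used below, one message tag per phase:
       tag1 - requested row indices are sent to the owning processes,
       tag2 - the owners reply with the number of columns for each requested row,
       tag3 - the owners send the column indices,
       tag4 - the owners send the numerical values (also used in the MAT_REUSE_MATRIX path).
  */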
  ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr);
  ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr);
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr);
  ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr);
  ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr);
  ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr);
  if (allcolumns) {
    icol = NULL;
    ncol = C->cmap->N;
  } else {
    ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  }

  if (scall == MAT_INITIAL_MATRIX) {
    PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;

    /* Get some new tags to keep the communication clean */
    tag1 = ((PetscObject)C)->tag;
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to whom, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them */
    ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr);

    /* w1[proc] = num of rows owned by proc -- to be requested */
    proc = 0;
    nrqs = 0; /* num of outgoing messages */
    for (j=0; j<nrow; j++) {
      row = irow[j];
      if (!isrowsorted) proc = 0;
      while (row >= C->rmap->range[proc+1]) proc++;
      w1[proc]++;
      row2proc[j] = proc; /* map row index to proc */

      if (proc != rank && !w2[proc]) {
        w2[proc] = 1; nrqs++;
      }
    }
    w1[rank] = 0; /* rows owned by self will not be requested */

    ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /* (proc-array) */
    for (proc=0,j=0; proc<size; proc++) {
      if (w1[proc]) { pa[j++] = proc;}
    }

    /* Each message would have a header = 1 + 2*(num of IS) + data (here, num of IS = 1) */
    msz = 0; /* total mesg length (for all procs) */
    for (i=0; i<nrqs; i++) {
      proc      = pa[i];
      w1[proc] += 3;
      msz      += w1[proc];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine nrqr, the number of messages to expect, their lengths, from the from-ids */
    /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);

    /* Input:  nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
       Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
    ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

    /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
    iptr = tmp;
    for (i=0; i<nrqs; i++) {
      proc        = pa[i];
      sbuf1[proc] = iptr;
      iptr       += w1[proc];
    }
    /* Form the outgoing messages */
    /* Initialize the header space */
    for (i=0; i<nrqs; i++) {
      proc = pa[i];
      ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr);
      ptr[proc] = sbuf1[proc] + 3;
    }

    /* Parse the isrow and copy data into outbuf */
    ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
    for (j=0; j<nrow; j++) { /* parse the indices of each IS */
      proc = row2proc[j];
      if (proc != rank) { /* copy to the outgoing buf */
        *ptr[proc] = irow[j];
        ctr[proc]++; ptr[proc]++;
      }
    }

    /* Update the headers for the current IS */
    for (j=0; j<size; j++) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j         = sbuf1[j];
        k               = ++sbuf1_j[0];
        sbuf1_j[2*k]    = ctr_j;
        sbuf1_j[2*k-1]  = 0;
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr);

    if (nrqs) rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];

    for (i=0; i<nrqs; ++i) {
      proc = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr);
    }

    ierr = PetscFree2(w1,w2);CHKERRQ(ierr);

    /* Send to other procs the buf size they should allocate */
    /* Receive messages */
    ierr = PetscMalloc1(nrqr,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr);
    for (i=0; i<nrqr; ++i) {
      req_size[i] = 0;
      rbuf1_i     = rbuf1[i];
      start       = 2*rbuf1_i[0] + 1;
      ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr);
      ierr = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr);
      sbuf2_i = sbuf2[i];
      for (j=start; j<end; j++) {
        k            = rbuf1_i[j] - rstart;
        ncols        = ai[k+1] - ai[k] + bi[k+1] - bi[k];
        sbuf2_i[j]   = ncols;
        req_size[i] += ncols;
      }
      req_source1[i] = r_status1[i].MPI_SOURCE;

      /* form the header */
      sbuf2_i[0] = req_size[i];
      for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

      ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr);
    }

    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);

    /* rbuf2 is received, Post recv column indices a->j */
    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr);

    ierr = PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr);
    }

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs,&s_status1);CHKERRQ(ierr);
    ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_status1);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);
    ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);}
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    for (i=0; i<nrqr; i++) { /* for each requested message */
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */

      kmax = rbuf1[i][2];
      for (k=0; k<kmax; k++,ct1++) { /* for each row */
        row    = rbuf1_i[ct1] - rstart;
        nzA    = ai[row+1] - ai[row];
        nzB    = bi[row+1] - bi[row];
        ncols  = nzA + nzB;
        cworkA = aj + ai[row]; cworkB = bj + bi[row];

        /* load the column indices for this row into cols */
        cols = sbuf_aj_i + ct2;

        lwrite = 0;
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
        }
        for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
        for (l=0; l<nzB; l++) {
          if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
        }

        ct2 += ncols;
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr);
    }

    /* create column map (cmap): global col of C -> local col of submat */
#if defined(PETSC_USE_CTABLE)
    if (!allcolumns) {
      ierr = PetscTableCreate(ncol,C->cmap->N,&cmap);CHKERRQ(ierr);
      ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
        if (icol[j] >= cstart && icol[j] < cend) {
          cmap_loc[icol[j] - cstart] = j+1;
        } else { /* use PetscTable for non-local col indices */
          ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
        }
      }
    } else {
      cmap     = NULL;
      cmap_loc = NULL;
    }
    ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr);
#else
    if (!allcolumns) {
      ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr);
      for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
    } else {
      cmap = NULL;
    }
#endif

    /* Create lens for MatSeqAIJSetPreallocation() */
    ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr);

    /* Compute lens from local part of C */
    for (j=0; j<nrow; j++) {
      row  = irow[j];
      proc = row2proc[j];
      if (proc == rank) {
        /* diagonal part A = c->A */
        ncols = ai[row-rstart+1] - ai[row-rstart];
        cols  = aj + ai[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            tcol = cmap_loc[cols[k]];
#else
            tcol = cmap[cols[k]+cstart];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] = ncols;
        }

        /* off-diagonal part B = c->B */
        ncols = bi[row-rstart+1] - bi[row-rstart];
        cols  = bj + bi[row-rstart];
        if (!allcolumns) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap[bmap[cols[k]]];
#endif
            if (tcol) lens[j]++;
          }
        } else { /* allcolumns */
          lens[j] += ncols;
        }
      }
    }
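    /*
       The column map above stores (local column index)+1 so that a lookup result of zero
       means "column not selected"; the PetscTable-based maps likewise insert key+1/value+1
       (and the lookups subtract one again afterwards) since PetscTableFind() returns zero
       for a missing key.
    */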
= PetscTableCreate(nrow,C->rmap->N,&rmap);CHKERRQ(ierr); 1564 for (j=0; j<nrow; j++) { 1565 row = irow[j]; 1566 proc = row2proc[j]; 1567 if (proc == rank) { /* a local row */ 1568 rmap_loc[row - rstart] = j; 1569 } else { 1570 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1571 } 1572 } 1573 #else 1574 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1575 for (j=0; j<nrow; j++) { 1576 rmap[irow[j]] = j; 1577 } 1578 #endif 1579 1580 /* Update lens from offproc data */ 1581 /* recv a->j is done */ 1582 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 1583 for (i=0; i<nrqs; i++) { 1584 proc = pa[i]; 1585 sbuf1_i = sbuf1[proc]; 1586 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1587 ct1 = 2 + 1; 1588 ct2 = 0; 1589 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1590 rbuf3_i = rbuf3[i]; /* received C->j */ 1591 1592 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1593 max1 = sbuf1_i[2]; 1594 for (k=0; k<max1; k++,ct1++) { 1595 #if defined(PETSC_USE_CTABLE) 1596 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1597 row--; 1598 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1599 #else 1600 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1601 #endif 1602 /* Now, store row index of submat in sbuf1_i[ct1] */ 1603 sbuf1_i[ct1] = row; 1604 1605 nnz = rbuf2_i[ct1]; 1606 if (!allcolumns) { 1607 for (l=0; l<nnz; l++,ct2++) { 1608 #if defined(PETSC_USE_CTABLE) 1609 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1610 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1611 } else { 1612 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1613 } 1614 #else 1615 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1616 #endif 1617 if (tcol) lens[row]++; 1618 } 1619 } else { /* allcolumns */ 1620 lens[row] += nnz; 1621 } 1622 } 1623 } 1624 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr); 1625 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1626 1627 /* Create the submatrices */ 1628 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1629 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1630 1631 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1632 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1633 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1634 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1635 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1636 1637 /* create struct Mat_SubSppt and attached it to submat */ 1638 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1639 subc = (Mat_SeqAIJ*)submat->data; 1640 subc->submatis1 = smatis1; 1641 1642 smatis1->id = 0; 1643 smatis1->nrqs = nrqs; 1644 smatis1->nrqr = nrqr; 1645 smatis1->rbuf1 = rbuf1; 1646 smatis1->rbuf2 = rbuf2; 1647 smatis1->rbuf3 = rbuf3; 1648 smatis1->sbuf2 = sbuf2; 1649 smatis1->req_source2 = req_source2; 1650 1651 smatis1->sbuf1 = sbuf1; 1652 smatis1->ptr = ptr; 1653 smatis1->tmp = tmp; 1654 smatis1->ctr = ctr; 1655 1656 smatis1->pa = pa; 1657 smatis1->req_size = req_size; 1658 smatis1->req_source1 = req_source1; 1659 1660 smatis1->allcolumns = allcolumns; 1661 smatis1->singleis = PETSC_TRUE; 1662 smatis1->row2proc = row2proc; 1663 smatis1->rmap = rmap; 1664 smatis1->cmap = cmap; 1665 #if defined(PETSC_USE_CTABLE) 1666 smatis1->rmap_loc = rmap_loc; 1667 smatis1->cmap_loc = cmap_loc; 1668 #endif 1669 1670 smatis1->destroy = 
submat->ops->destroy; 1671 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1672 submat->factortype = C->factortype; 1673 1674 /* compute rmax */ 1675 rmax = 0; 1676 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1677 1678 } else { /* scall == MAT_REUSE_MATRIX */ 1679 submat = submats[0]; 1680 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1681 1682 subc = (Mat_SeqAIJ*)submat->data; 1683 rmax = subc->rmax; 1684 smatis1 = subc->submatis1; 1685 nrqs = smatis1->nrqs; 1686 nrqr = smatis1->nrqr; 1687 rbuf1 = smatis1->rbuf1; 1688 rbuf2 = smatis1->rbuf2; 1689 rbuf3 = smatis1->rbuf3; 1690 req_source2 = smatis1->req_source2; 1691 1692 sbuf1 = smatis1->sbuf1; 1693 sbuf2 = smatis1->sbuf2; 1694 ptr = smatis1->ptr; 1695 tmp = smatis1->tmp; 1696 ctr = smatis1->ctr; 1697 1698 pa = smatis1->pa; 1699 req_size = smatis1->req_size; 1700 req_source1 = smatis1->req_source1; 1701 1702 allcolumns = smatis1->allcolumns; 1703 row2proc = smatis1->row2proc; 1704 rmap = smatis1->rmap; 1705 cmap = smatis1->cmap; 1706 #if defined(PETSC_USE_CTABLE) 1707 rmap_loc = smatis1->rmap_loc; 1708 cmap_loc = smatis1->cmap_loc; 1709 #endif 1710 } 1711 1712 /* Post recv matrix values */ 1713 ierr = PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1714 ierr = PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);CHKERRQ(ierr); 1715 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1716 for (i=0; i<nrqs; ++i) { 1717 ierr = PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr); 1718 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 1719 } 1720 1721 /* Allocate sending buffers for a->a, and send them off */ 1722 ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr); 1723 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1724 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);} 1725 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1726 1727 for (i=0; i<nrqr; i++) { 1728 rbuf1_i = rbuf1[i]; 1729 sbuf_aa_i = sbuf_aa[i]; 1730 ct1 = 2*rbuf1_i[0]+1; 1731 ct2 = 0; 1732 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1733 1734 kmax = rbuf1_i[2]; 1735 for (k=0; k<kmax; k++,ct1++) { 1736 row = rbuf1_i[ct1] - rstart; 1737 nzA = ai[row+1] - ai[row]; 1738 nzB = bi[row+1] - bi[row]; 1739 ncols = nzA + nzB; 1740 cworkB = bj + bi[row]; 1741 vworkA = a_a + ai[row]; 1742 vworkB = b_a + bi[row]; 1743 1744 /* load the column values for this row into vals*/ 1745 vals = sbuf_aa_i + ct2; 1746 1747 lwrite = 0; 1748 for (l=0; l<nzB; l++) { 1749 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1750 } 1751 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1752 for (l=0; l<nzB; l++) { 1753 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1754 } 1755 1756 ct2 += ncols; 1757 } 1758 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 1759 } 1760 1761 /* Assemble submat */ 1762 /* First assemble the local rows */ 1763 for (j=0; j<nrow; j++) { 1764 row = irow[j]; 1765 proc = row2proc[j]; 1766 if (proc == rank) { 1767 Crow = row - rstart; /* local row index of C */ 1768 #if defined(PETSC_USE_CTABLE) 1769 row = rmap_loc[Crow]; /* row index of submat */ 1770 #else 1771 row = rmap[row]; 1772 #endif 1773 1774 if (allcolumns) { 1775 /* diagonal part A = c->A */ 1776 ncols = ai[Crow+1] - ai[Crow]; 1777 cols = aj + ai[Crow]; 1778 vals = a_a + 
ai[Crow]; 1779 i = 0; 1780 for (k=0; k<ncols; k++) { 1781 subcols[i] = cols[k] + cstart; 1782 subvals[i++] = vals[k]; 1783 } 1784 1785 /* off-diagonal part B = c->B */ 1786 ncols = bi[Crow+1] - bi[Crow]; 1787 cols = bj + bi[Crow]; 1788 vals = b_a + bi[Crow]; 1789 for (k=0; k<ncols; k++) { 1790 subcols[i] = bmap[cols[k]]; 1791 subvals[i++] = vals[k]; 1792 } 1793 1794 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1795 1796 } else { /* !allcolumns */ 1797 #if defined(PETSC_USE_CTABLE) 1798 /* diagonal part A = c->A */ 1799 ncols = ai[Crow+1] - ai[Crow]; 1800 cols = aj + ai[Crow]; 1801 vals = a_a + ai[Crow]; 1802 i = 0; 1803 for (k=0; k<ncols; k++) { 1804 tcol = cmap_loc[cols[k]]; 1805 if (tcol) { 1806 subcols[i] = --tcol; 1807 subvals[i++] = vals[k]; 1808 } 1809 } 1810 1811 /* off-diagonal part B = c->B */ 1812 ncols = bi[Crow+1] - bi[Crow]; 1813 cols = bj + bi[Crow]; 1814 vals = b_a + bi[Crow]; 1815 for (k=0; k<ncols; k++) { 1816 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1817 if (tcol) { 1818 subcols[i] = --tcol; 1819 subvals[i++] = vals[k]; 1820 } 1821 } 1822 #else 1823 /* diagonal part A = c->A */ 1824 ncols = ai[Crow+1] - ai[Crow]; 1825 cols = aj + ai[Crow]; 1826 vals = a_a + ai[Crow]; 1827 i = 0; 1828 for (k=0; k<ncols; k++) { 1829 tcol = cmap[cols[k]+cstart]; 1830 if (tcol) { 1831 subcols[i] = --tcol; 1832 subvals[i++] = vals[k]; 1833 } 1834 } 1835 1836 /* off-diagonal part B = c->B */ 1837 ncols = bi[Crow+1] - bi[Crow]; 1838 cols = bj + bi[Crow]; 1839 vals = b_a + bi[Crow]; 1840 for (k=0; k<ncols; k++) { 1841 tcol = cmap[bmap[cols[k]]]; 1842 if (tcol) { 1843 subcols[i] = --tcol; 1844 subvals[i++] = vals[k]; 1845 } 1846 } 1847 #endif 1848 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1849 } 1850 } 1851 } 1852 1853 /* Now assemble the off-proc rows */ 1854 for (i=0; i<nrqs; i++) { /* for each requested message */ 1855 /* recv values from other processes */ 1856 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr); 1857 proc = pa[idex]; 1858 sbuf1_i = sbuf1[proc]; 1859 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */ 1860 ct1 = 2 + 1; 1861 ct2 = 0; /* count of received C->j */ 1862 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1863 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1864 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1865 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1866 1867 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1868 max1 = sbuf1_i[2]; /* num of rows */ 1869 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1870 row = sbuf1_i[ct1]; /* row index of submat */ 1871 if (!allcolumns) { 1872 idex = 0; 1873 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1874 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1875 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1876 #if defined(PETSC_USE_CTABLE) 1877 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1878 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1879 } else { 1880 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1881 } 1882 #else 1883 tcol = cmap[rbuf3_i[ct2]]; 1884 #endif 1885 if (tcol) { 1886 subcols[idex] = --tcol; /* may not be sorted */ 1887 subvals[idex++] = rbuf4_i[ct2]; 1888 1889 /* We receive an entire column of C, but a subset of it needs to 
be inserted into submat. 1890 For reuse, we replace received C->j with index that should be inserted to submat */ 1891 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1892 } 1893 } 1894 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1895 } else { /* scall == MAT_REUSE_MATRIX */ 1896 submat = submats[0]; 1897 subc = (Mat_SeqAIJ*)submat->data; 1898 1899 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1900 for (l=0; l<nnz; l++) { 1901 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1902 subvals[idex++] = rbuf4_i[ct2]; 1903 } 1904 1905 bj = subc->j + subc->i[row]; /* sorted column indices */ 1906 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1907 } 1908 } else { /* allcolumns */ 1909 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1910 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1911 ct2 += nnz; 1912 } 1913 } 1914 } 1915 1916 /* sending a->a are done */ 1917 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr); 1918 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1919 1920 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1921 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1922 submats[0] = submat; 1923 1924 /* Restore the indices */ 1925 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1926 if (!allcolumns) { 1927 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1928 } 1929 1930 /* Destroy allocated memory */ 1931 for (i=0; i<nrqs; ++i) { 1932 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1933 } 1934 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1935 if (sbuf_aa) { 1936 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1937 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1938 } 1939 1940 if (scall == MAT_INITIAL_MATRIX) { 1941 ierr = PetscFree(lens);CHKERRQ(ierr); 1942 if (sbuf_aj) { 1943 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1944 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1945 } 1946 } 1947 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1948 ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1949 PetscFunctionReturn(0); 1950 } 1951 1952 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1953 { 1954 PetscErrorCode ierr; 1955 PetscInt ncol; 1956 PetscBool colflag,allcolumns=PETSC_FALSE; 1957 1958 PetscFunctionBegin; 1959 /* Allocate memory to hold all the submatrices */ 1960 if (scall == MAT_INITIAL_MATRIX) { 1961 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1962 } 1963 1964 /* Check for special case: each processor gets entire matrix columns */ 1965 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1966 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1967 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1968 1969 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1970 PetscFunctionReturn(0); 1971 } 1972 1973 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1974 { 1975 PetscErrorCode ierr; 1976 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1977 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1978 Mat_SeqAIJ *subc; 1979 Mat_SubSppt *smat; 1980 1981 PetscFunctionBegin; 1982 /* Check for special case: each 
processor has a single IS */ 1983 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */ 1984 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1985 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1986 PetscFunctionReturn(0); 1987 } 1988 1989 /* Collect global wantallmatrix and nstages */ 1990 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1991 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1992 if (!nmax) nmax = 1; 1993 1994 if (scall == MAT_INITIAL_MATRIX) { 1995 /* Collect global wantallmatrix and nstages */ 1996 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1997 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1998 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1999 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2000 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2001 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2002 wantallmatrix = PETSC_TRUE; 2003 2004 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2005 } 2006 } 2007 2008 /* Determine the number of stages through which submatrices are done 2009 Each stage will extract nmax submatrices. 2010 nmax is determined by the matrix column dimension. 2011 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2012 */ 2013 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 2014 2015 in[0] = -1*(PetscInt)wantallmatrix; 2016 in[1] = nstages; 2017 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr); 2018 wantallmatrix = (PetscBool)(-out[0]); 2019 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2020 2021 } else { /* MAT_REUSE_MATRIX */ 2022 if (ismax) { 2023 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2024 smat = subc->submatis1; 2025 } else { /* (*submat)[0] is a dummy matrix */ 2026 smat = (Mat_SubSppt*)(*submat)[0]->data; 2027 } 2028 if (!smat) { 2029 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) 
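      so the staged send/receive machinery below cannot be reused; fall back to MatCreateSubMatrix_MPIAIJ_All() by setting wantallmatrix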
*/ 2030 wantallmatrix = PETSC_TRUE; 2031 } else if (smat->singleis) { 2032 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2033 PetscFunctionReturn(0); 2034 } else { 2035 nstages = smat->nstages; 2036 } 2037 } 2038 2039 if (wantallmatrix) { 2040 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2041 PetscFunctionReturn(0); 2042 } 2043 2044 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2045 if (scall == MAT_INITIAL_MATRIX) { 2046 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2047 } 2048 2049 for (i=0,pos=0; i<nstages; i++) { 2050 if (pos+nmax <= ismax) max_no = nmax; 2051 else if (pos >= ismax) max_no = 0; 2052 else max_no = ismax-pos; 2053 2054 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2055 if (!max_no) { 2056 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2057 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2058 smat->nstages = nstages; 2059 } 2060 pos++; /* advance to next dummy matrix if any */ 2061 } else pos += max_no; 2062 } 2063 2064 if (ismax && scall == MAT_INITIAL_MATRIX) { 2065 /* save nstages for reuse */ 2066 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2067 smat = subc->submatis1; 2068 smat->nstages = nstages; 2069 } 2070 PetscFunctionReturn(0); 2071 } 2072 2073 /* -------------------------------------------------------------------------*/ 2074 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2075 { 2076 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2077 Mat A = c->A; 2078 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2079 const PetscInt **icol,**irow; 2080 PetscInt *nrow,*ncol,start; 2081 PetscErrorCode ierr; 2082 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2083 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2084 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2085 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2086 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2087 #if defined(PETSC_USE_CTABLE) 2088 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2089 #else 2090 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2091 #endif 2092 const PetscInt *irow_i; 2093 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2094 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2095 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2096 MPI_Comm comm; 2097 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2098 PetscMPIInt *onodes1,*olengths1,end; 2099 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2100 Mat_SubSppt *smat_i; 2101 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2102 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2103 2104 PetscFunctionBegin; 2105 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2106 size = c->size; 2107 rank = c->rank; 2108 2109 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2110 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2111 2112 for (i=0; i<ismax; i++) { 2113 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2114 if (!issorted[i]) iscsorted = issorted[i]; 2115 2116 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2117 2118 ierr = 
ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2119 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2120 2121 /* Check for special case: allcolumn */ 2122 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2123 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2124 if (colflag && ncol[i] == C->cmap->N) { 2125 allcolumns[i] = PETSC_TRUE; 2126 icol[i] = NULL; 2127 } else { 2128 allcolumns[i] = PETSC_FALSE; 2129 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2130 } 2131 } 2132 2133 if (scall == MAT_REUSE_MATRIX) { 2134 /* Assumes new rows are same length as the old rows */ 2135 for (i=0; i<ismax; i++) { 2136 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2137 subc = (Mat_SeqAIJ*)submats[i]->data; 2138 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2139 2140 /* Initial matrix as if empty */ 2141 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2142 2143 smat_i = subc->submatis1; 2144 2145 nrqs = smat_i->nrqs; 2146 nrqr = smat_i->nrqr; 2147 rbuf1 = smat_i->rbuf1; 2148 rbuf2 = smat_i->rbuf2; 2149 rbuf3 = smat_i->rbuf3; 2150 req_source2 = smat_i->req_source2; 2151 2152 sbuf1 = smat_i->sbuf1; 2153 sbuf2 = smat_i->sbuf2; 2154 ptr = smat_i->ptr; 2155 tmp = smat_i->tmp; 2156 ctr = smat_i->ctr; 2157 2158 pa = smat_i->pa; 2159 req_size = smat_i->req_size; 2160 req_source1 = smat_i->req_source1; 2161 2162 allcolumns[i] = smat_i->allcolumns; 2163 row2proc[i] = smat_i->row2proc; 2164 rmap[i] = smat_i->rmap; 2165 cmap[i] = smat_i->cmap; 2166 } 2167 2168 if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */ 2169 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2170 smat_i = (Mat_SubSppt*)submats[0]->data; 2171 2172 nrqs = smat_i->nrqs; 2173 nrqr = smat_i->nrqr; 2174 rbuf1 = smat_i->rbuf1; 2175 rbuf2 = smat_i->rbuf2; 2176 rbuf3 = smat_i->rbuf3; 2177 req_source2 = smat_i->req_source2; 2178 2179 sbuf1 = smat_i->sbuf1; 2180 sbuf2 = smat_i->sbuf2; 2181 ptr = smat_i->ptr; 2182 tmp = smat_i->tmp; 2183 ctr = smat_i->ctr; 2184 2185 pa = smat_i->pa; 2186 req_size = smat_i->req_size; 2187 req_source1 = smat_i->req_source1; 2188 2189 allcolumns[0] = PETSC_FALSE; 2190 } 2191 } else { /* scall == MAT_INITIAL_MATRIX */ 2192 /* Get some new tags to keep the communication clean */ 2193 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2194 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2195 2196 /* evaluate communication - mesg to who, length of mesg, and buffer space 2197 required. 
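      Work vectors: w1[p] accumulates the number of row indices this rank will send to rank p, w2[p] flags whether a message to p exists, w3[p] counts how many ISs contribute rows owned by p, and w4[] is per-IS scratch.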
Based on this, buffers are allocated, and data copied into them*/ 2198 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2199 2200 for (i=0; i<ismax; i++) { 2201 jmax = nrow[i]; 2202 irow_i = irow[i]; 2203 2204 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2205 row2proc[i] = row2proc_i; 2206 2207 if (issorted[i]) proc = 0; 2208 for (j=0; j<jmax; j++) { 2209 if (!issorted[i]) proc = 0; 2210 row = irow_i[j]; 2211 while (row >= C->rmap->range[proc+1]) proc++; 2212 w4[proc]++; 2213 row2proc_i[j] = proc; /* map row index to proc */ 2214 } 2215 for (j=0; j<size; j++) { 2216 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2217 } 2218 } 2219 2220 nrqs = 0; /* no of outgoing messages */ 2221 msz = 0; /* total mesg length (for all procs) */ 2222 w1[rank] = 0; /* no mesg sent to self */ 2223 w3[rank] = 0; 2224 for (i=0; i<size; i++) { 2225 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2226 } 2227 ierr = PetscMalloc1(nrqs,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2228 for (i=0,j=0; i<size; i++) { 2229 if (w1[i]) { pa[j] = i; j++; } 2230 } 2231 2232 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2233 for (i=0; i<nrqs; i++) { 2234 j = pa[i]; 2235 w1[j] += w2[j] + 2* w3[j]; 2236 msz += w1[j]; 2237 } 2238 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2239 2240 /* Determine the number of messages to expect, their lengths, from from-ids */ 2241 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2242 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2243 2244 /* Now post the Irecvs corresponding to these messages */ 2245 ierr = PetscObjectGetNewTag((PetscObject)C,&tag0);CHKERRQ(ierr); 2246 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2247 2248 /* Allocate Memory for outgoing messages */ 2249 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2250 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2251 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2252 2253 { 2254 PetscInt *iptr = tmp; 2255 k = 0; 2256 for (i=0; i<nrqs; i++) { 2257 j = pa[i]; 2258 iptr += k; 2259 sbuf1[j] = iptr; 2260 k = w1[j]; 2261 } 2262 } 2263 2264 /* Form the outgoing messages. 
Initialize the header space */ 2265 for (i=0; i<nrqs; i++) { 2266 j = pa[i]; 2267 sbuf1[j][0] = 0; 2268 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2269 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2270 } 2271 2272 /* Parse the isrow and copy data into outbuf */ 2273 for (i=0; i<ismax; i++) { 2274 row2proc_i = row2proc[i]; 2275 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2276 irow_i = irow[i]; 2277 jmax = nrow[i]; 2278 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2279 proc = row2proc_i[j]; 2280 if (proc != rank) { /* copy to the outgoing buf*/ 2281 ctr[proc]++; 2282 *ptr[proc] = irow_i[j]; 2283 ptr[proc]++; 2284 } 2285 } 2286 /* Update the headers for the current IS */ 2287 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2288 if ((ctr_j = ctr[j])) { 2289 sbuf1_j = sbuf1[j]; 2290 k = ++sbuf1_j[0]; 2291 sbuf1_j[2*k] = ctr_j; 2292 sbuf1_j[2*k-1] = i; 2293 } 2294 } 2295 } 2296 2297 /* Now post the sends */ 2298 ierr = PetscMalloc1(nrqs,&s_waits1);CHKERRQ(ierr); 2299 for (i=0; i<nrqs; ++i) { 2300 j = pa[i]; 2301 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr); 2302 } 2303 2304 /* Post Receives to capture the buffer size */ 2305 ierr = PetscMalloc1(nrqs,&r_waits2);CHKERRQ(ierr); 2306 ierr = PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);CHKERRQ(ierr); 2307 if (nrqs) rbuf2[0] = tmp + msz; 2308 for (i=1; i<nrqs; ++i) { 2309 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2310 } 2311 for (i=0; i<nrqs; ++i) { 2312 j = pa[i]; 2313 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 2314 } 2315 2316 /* Send to other procs the buf size they should allocate */ 2317 /* Receive messages*/ 2318 ierr = PetscMalloc1(nrqr,&s_waits2);CHKERRQ(ierr); 2319 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2320 { 2321 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2322 PetscInt *sbuf2_i; 2323 2324 ierr = MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2325 for (i=0; i<nrqr; ++i) { 2326 req_size[i] = 0; 2327 rbuf1_i = rbuf1[i]; 2328 start = 2*rbuf1_i[0] + 1; 2329 end = olengths1[i]; 2330 ierr = PetscMalloc1(end,&sbuf2[i]);CHKERRQ(ierr); 2331 sbuf2_i = sbuf2[i]; 2332 for (j=start; j<end; j++) { 2333 id = rbuf1_i[j] - rstart; 2334 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2335 sbuf2_i[j] = ncols; 2336 req_size[i] += ncols; 2337 } 2338 req_source1[i] = onodes1[i]; 2339 /* form the header */ 2340 sbuf2_i[0] = req_size[i]; 2341 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2342 2343 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 2344 } 2345 } 2346 2347 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2348 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2349 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2350 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2351 2352 /* Receive messages*/ 2353 ierr = PetscMalloc1(nrqs,&r_waits3);CHKERRQ(ierr); 2354 ierr = MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2355 for (i=0; i<nrqs; ++i) { 2356 ierr = PetscMalloc1(rbuf2[i][0],&rbuf3[i]);CHKERRQ(ierr); 2357 req_source2[i] = pa[i]; 2358 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 2359 } 2360 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2361 2362 /* Wait on sends1 and sends2 */ 2363 ierr = MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2364 ierr = MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2365 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2366 
ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2367 2368 /* Now allocate sending buffers for a->j, and send them off */ 2369 ierr = PetscMalloc1(nrqr,&sbuf_aj);CHKERRQ(ierr); 2370 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2371 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aj[0]);CHKERRQ(ierr);} 2372 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2373 2374 ierr = PetscMalloc1(nrqr,&s_waits3);CHKERRQ(ierr); 2375 { 2376 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2377 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2378 PetscInt cend = C->cmap->rend; 2379 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2380 2381 for (i=0; i<nrqr; i++) { 2382 rbuf1_i = rbuf1[i]; 2383 sbuf_aj_i = sbuf_aj[i]; 2384 ct1 = 2*rbuf1_i[0] + 1; 2385 ct2 = 0; 2386 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2387 kmax = rbuf1[i][2*j]; 2388 for (k=0; k<kmax; k++,ct1++) { 2389 row = rbuf1_i[ct1] - rstart; 2390 nzA = a_i[row+1] - a_i[row]; 2391 nzB = b_i[row+1] - b_i[row]; 2392 ncols = nzA + nzB; 2393 cworkA = a_j + a_i[row]; 2394 cworkB = b_j + b_i[row]; 2395 2396 /* load the column indices for this row into cols */ 2397 cols = sbuf_aj_i + ct2; 2398 2399 lwrite = 0; 2400 for (l=0; l<nzB; l++) { 2401 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2402 } 2403 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2404 for (l=0; l<nzB; l++) { 2405 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2406 } 2407 2408 ct2 += ncols; 2409 } 2410 } 2411 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 2412 } 2413 } 2414 2415 /* create col map: global col of C -> local col of submatrices */ 2416 { 2417 const PetscInt *icol_i; 2418 #if defined(PETSC_USE_CTABLE) 2419 for (i=0; i<ismax; i++) { 2420 if (!allcolumns[i]) { 2421 ierr = PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2422 2423 jmax = ncol[i]; 2424 icol_i = icol[i]; 2425 cmap_i = cmap[i]; 2426 for (j=0; j<jmax; j++) { 2427 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2428 } 2429 } else cmap[i] = NULL; 2430 } 2431 #else 2432 for (i=0; i<ismax; i++) { 2433 if (!allcolumns[i]) { 2434 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2435 jmax = ncol[i]; 2436 icol_i = icol[i]; 2437 cmap_i = cmap[i]; 2438 for (j=0; j<jmax; j++) { 2439 cmap_i[icol_i[j]] = j+1; 2440 } 2441 } else cmap[i] = NULL; 2442 } 2443 #endif 2444 } 2445 2446 /* Create lens which is required for MatCreate... 
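       lens[i][j] will hold the number of nonzeros of local row j of submats[i]; it is filled from the local and off-process data below and then passed to MatSeqAIJSetPreallocation().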
*/ 2447 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2448 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2449 2450 if (ismax) { 2451 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2452 } 2453 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2454 2455 /* Update lens from local data */ 2456 for (i=0; i<ismax; i++) { 2457 row2proc_i = row2proc[i]; 2458 jmax = nrow[i]; 2459 if (!allcolumns[i]) cmap_i = cmap[i]; 2460 irow_i = irow[i]; 2461 lens_i = lens[i]; 2462 for (j=0; j<jmax; j++) { 2463 row = irow_i[j]; 2464 proc = row2proc_i[j]; 2465 if (proc == rank) { 2466 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2467 if (!allcolumns[i]) { 2468 for (k=0; k<ncols; k++) { 2469 #if defined(PETSC_USE_CTABLE) 2470 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2471 #else 2472 tcol = cmap_i[cols[k]]; 2473 #endif 2474 if (tcol) lens_i[j]++; 2475 } 2476 } else { /* allcolumns */ 2477 lens_i[j] = ncols; 2478 } 2479 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2480 } 2481 } 2482 } 2483 2484 /* Create row map: global row of C -> local row of submatrices */ 2485 #if defined(PETSC_USE_CTABLE) 2486 for (i=0; i<ismax; i++) { 2487 ierr = PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2488 irow_i = irow[i]; 2489 jmax = nrow[i]; 2490 for (j=0; j<jmax; j++) { 2491 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2492 } 2493 } 2494 #else 2495 for (i=0; i<ismax; i++) { 2496 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2497 rmap_i = rmap[i]; 2498 irow_i = irow[i]; 2499 jmax = nrow[i]; 2500 for (j=0; j<jmax; j++) { 2501 rmap_i[irow_i[j]] = j; 2502 } 2503 } 2504 #endif 2505 2506 /* Update lens from offproc data */ 2507 { 2508 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2509 2510 ierr = MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2511 for (tmp2=0; tmp2<nrqs; tmp2++) { 2512 sbuf1_i = sbuf1[pa[tmp2]]; 2513 jmax = sbuf1_i[0]; 2514 ct1 = 2*jmax+1; 2515 ct2 = 0; 2516 rbuf2_i = rbuf2[tmp2]; 2517 rbuf3_i = rbuf3[tmp2]; 2518 for (j=1; j<=jmax; j++) { 2519 is_no = sbuf1_i[2*j-1]; 2520 max1 = sbuf1_i[2*j]; 2521 lens_i = lens[is_no]; 2522 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2523 rmap_i = rmap[is_no]; 2524 for (k=0; k<max1; k++,ct1++) { 2525 #if defined(PETSC_USE_CTABLE) 2526 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2527 row--; 2528 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2529 #else 2530 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2531 #endif 2532 max2 = rbuf2_i[ct1]; 2533 for (l=0; l<max2; l++,ct2++) { 2534 if (!allcolumns[is_no]) { 2535 #if defined(PETSC_USE_CTABLE) 2536 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2537 #else 2538 tcol = cmap_i[rbuf3_i[ct2]]; 2539 #endif 2540 if (tcol) lens_i[row]++; 2541 } else { /* allcolumns */ 2542 lens_i[row]++; /* lens_i[row] += max2 ? 
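      presumably equivalent, provided ct2 is advanced by max2 as well; the loop is kept so ct2 stays in sync with the received column data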
*/ 2543 } 2544 } 2545 } 2546 } 2547 } 2548 } 2549 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2550 ierr = MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2551 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2552 2553 /* Create the submatrices */ 2554 for (i=0; i<ismax; i++) { 2555 PetscInt rbs,cbs; 2556 2557 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2558 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2559 2560 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2561 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2562 2563 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2564 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2565 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2566 2567 /* create struct Mat_SubSppt and attached it to submat */ 2568 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2569 subc = (Mat_SeqAIJ*)submats[i]->data; 2570 subc->submatis1 = smat_i; 2571 2572 smat_i->destroy = submats[i]->ops->destroy; 2573 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2574 submats[i]->factortype = C->factortype; 2575 2576 smat_i->id = i; 2577 smat_i->nrqs = nrqs; 2578 smat_i->nrqr = nrqr; 2579 smat_i->rbuf1 = rbuf1; 2580 smat_i->rbuf2 = rbuf2; 2581 smat_i->rbuf3 = rbuf3; 2582 smat_i->sbuf2 = sbuf2; 2583 smat_i->req_source2 = req_source2; 2584 2585 smat_i->sbuf1 = sbuf1; 2586 smat_i->ptr = ptr; 2587 smat_i->tmp = tmp; 2588 smat_i->ctr = ctr; 2589 2590 smat_i->pa = pa; 2591 smat_i->req_size = req_size; 2592 smat_i->req_source1 = req_source1; 2593 2594 smat_i->allcolumns = allcolumns[i]; 2595 smat_i->singleis = PETSC_FALSE; 2596 smat_i->row2proc = row2proc[i]; 2597 smat_i->rmap = rmap[i]; 2598 smat_i->cmap = cmap[i]; 2599 } 2600 2601 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2602 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2603 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2604 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2605 2606 /* create struct Mat_SubSppt and attached it to submat */ 2607 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2608 submats[0]->data = (void*)smat_i; 2609 2610 smat_i->destroy = submats[0]->ops->destroy; 2611 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2612 submats[0]->factortype = C->factortype; 2613 2614 smat_i->id = 0; 2615 smat_i->nrqs = nrqs; 2616 smat_i->nrqr = nrqr; 2617 smat_i->rbuf1 = rbuf1; 2618 smat_i->rbuf2 = rbuf2; 2619 smat_i->rbuf3 = rbuf3; 2620 smat_i->sbuf2 = sbuf2; 2621 smat_i->req_source2 = req_source2; 2622 2623 smat_i->sbuf1 = sbuf1; 2624 smat_i->ptr = ptr; 2625 smat_i->tmp = tmp; 2626 smat_i->ctr = ctr; 2627 2628 smat_i->pa = pa; 2629 smat_i->req_size = req_size; 2630 smat_i->req_source1 = req_source1; 2631 2632 smat_i->allcolumns = PETSC_FALSE; 2633 smat_i->singleis = PETSC_FALSE; 2634 smat_i->row2proc = NULL; 2635 smat_i->rmap = NULL; 2636 smat_i->cmap = NULL; 2637 } 2638 2639 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2640 ierr = PetscFree(lens);CHKERRQ(ierr); 2641 if (sbuf_aj) { 2642 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2643 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2644 } 2645 2646 } /* endof scall == MAT_INITIAL_MATRIX */ 2647 2648 /* Post recv matrix values */ 2649 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2650 ierr = PetscMalloc1(nrqs,&rbuf4);CHKERRQ(ierr); 2651 ierr = PetscMalloc1(nrqs,&r_waits4);CHKERRQ(ierr); 2652 for (i=0; i<nrqs; ++i) { 2653 ierr = 
PetscMalloc1(rbuf2[i][0],&rbuf4[i]);CHKERRQ(ierr); 2654 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 2655 } 2656 2657 /* Allocate sending buffers for a->a, and send them off */ 2658 ierr = PetscMalloc1(nrqr,&sbuf_aa);CHKERRQ(ierr); 2659 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2660 if (nrqr) {ierr = PetscMalloc1(j,&sbuf_aa[0]);CHKERRQ(ierr);} 2661 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2662 2663 ierr = PetscMalloc1(nrqr,&s_waits4);CHKERRQ(ierr); 2664 { 2665 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2666 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2667 PetscInt cend = C->cmap->rend; 2668 PetscInt *b_j = b->j; 2669 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2670 2671 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2672 ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2673 for (i=0; i<nrqr; i++) { 2674 rbuf1_i = rbuf1[i]; 2675 sbuf_aa_i = sbuf_aa[i]; 2676 ct1 = 2*rbuf1_i[0]+1; 2677 ct2 = 0; 2678 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2679 kmax = rbuf1_i[2*j]; 2680 for (k=0; k<kmax; k++,ct1++) { 2681 row = rbuf1_i[ct1] - rstart; 2682 nzA = a_i[row+1] - a_i[row]; 2683 nzB = b_i[row+1] - b_i[row]; 2684 ncols = nzA + nzB; 2685 cworkB = b_j + b_i[row]; 2686 vworkA = a_a + a_i[row]; 2687 vworkB = b_a + b_i[row]; 2688 2689 /* load the column values for this row into vals*/ 2690 vals = sbuf_aa_i+ct2; 2691 2692 lwrite = 0; 2693 for (l=0; l<nzB; l++) { 2694 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2695 } 2696 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2697 for (l=0; l<nzB; l++) { 2698 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2699 } 2700 2701 ct2 += ncols; 2702 } 2703 } 2704 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 2705 } 2706 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2707 ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2708 } 2709 2710 /* Assemble the matrices */ 2711 /* First assemble the local rows */ 2712 for (i=0; i<ismax; i++) { 2713 row2proc_i = row2proc[i]; 2714 subc = (Mat_SeqAIJ*)submats[i]->data; 2715 imat_ilen = subc->ilen; 2716 imat_j = subc->j; 2717 imat_i = subc->i; 2718 imat_a = subc->a; 2719 2720 if (!allcolumns[i]) cmap_i = cmap[i]; 2721 rmap_i = rmap[i]; 2722 irow_i = irow[i]; 2723 jmax = nrow[i]; 2724 for (j=0; j<jmax; j++) { 2725 row = irow_i[j]; 2726 proc = row2proc_i[j]; 2727 if (proc == rank) { 2728 old_row = row; 2729 #if defined(PETSC_USE_CTABLE) 2730 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2731 row--; 2732 #else 2733 row = rmap_i[row]; 2734 #endif 2735 ilen_row = imat_ilen[row]; 2736 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2737 mat_i = imat_i[row]; 2738 mat_a = imat_a + mat_i; 2739 mat_j = imat_j + mat_i; 2740 if (!allcolumns[i]) { 2741 for (k=0; k<ncols; k++) { 2742 #if defined(PETSC_USE_CTABLE) 2743 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2744 #else 2745 tcol = cmap_i[cols[k]]; 2746 #endif 2747 if (tcol) { 2748 *mat_j++ = tcol - 1; 2749 *mat_a++ = vals[k]; 2750 ilen_row++; 2751 } 2752 } 2753 } else { /* allcolumns */ 2754 for (k=0; k<ncols; k++) { 2755 *mat_j++ = cols[k]; /* global col index! 
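      valid because in the allcolumns case the submatrix is created with C->cmap->N columns, so C's global column numbering carries over unchanged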
*/ 2756 *mat_a++ = vals[k]; 2757 ilen_row++; 2758 } 2759 } 2760 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2761 2762 imat_ilen[row] = ilen_row; 2763 } 2764 } 2765 } 2766 2767 /* Now assemble the off proc rows */ 2768 ierr = MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2769 for (tmp2=0; tmp2<nrqs; tmp2++) { 2770 sbuf1_i = sbuf1[pa[tmp2]]; 2771 jmax = sbuf1_i[0]; 2772 ct1 = 2*jmax + 1; 2773 ct2 = 0; 2774 rbuf2_i = rbuf2[tmp2]; 2775 rbuf3_i = rbuf3[tmp2]; 2776 rbuf4_i = rbuf4[tmp2]; 2777 for (j=1; j<=jmax; j++) { 2778 is_no = sbuf1_i[2*j-1]; 2779 rmap_i = rmap[is_no]; 2780 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2781 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2782 imat_ilen = subc->ilen; 2783 imat_j = subc->j; 2784 imat_i = subc->i; 2785 imat_a = subc->a; 2786 max1 = sbuf1_i[2*j]; 2787 for (k=0; k<max1; k++,ct1++) { 2788 row = sbuf1_i[ct1]; 2789 #if defined(PETSC_USE_CTABLE) 2790 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2791 row--; 2792 #else 2793 row = rmap_i[row]; 2794 #endif 2795 ilen = imat_ilen[row]; 2796 mat_i = imat_i[row]; 2797 mat_a = imat_a + mat_i; 2798 mat_j = imat_j + mat_i; 2799 max2 = rbuf2_i[ct1]; 2800 if (!allcolumns[is_no]) { 2801 for (l=0; l<max2; l++,ct2++) { 2802 #if defined(PETSC_USE_CTABLE) 2803 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2804 #else 2805 tcol = cmap_i[rbuf3_i[ct2]]; 2806 #endif 2807 if (tcol) { 2808 *mat_j++ = tcol - 1; 2809 *mat_a++ = rbuf4_i[ct2]; 2810 ilen++; 2811 } 2812 } 2813 } else { /* allcolumns */ 2814 for (l=0; l<max2; l++,ct2++) { 2815 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2816 *mat_a++ = rbuf4_i[ct2]; 2817 ilen++; 2818 } 2819 } 2820 imat_ilen[row] = ilen; 2821 } 2822 } 2823 } 2824 2825 if (!iscsorted) { /* sort column indices of the rows */ 2826 for (i=0; i<ismax; i++) { 2827 subc = (Mat_SeqAIJ*)submats[i]->data; 2828 imat_j = subc->j; 2829 imat_i = subc->i; 2830 imat_a = subc->a; 2831 imat_ilen = subc->ilen; 2832 2833 if (allcolumns[i]) continue; 2834 jmax = nrow[i]; 2835 for (j=0; j<jmax; j++) { 2836 mat_i = imat_i[j]; 2837 mat_a = imat_a + mat_i; 2838 mat_j = imat_j + mat_i; 2839 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2840 } 2841 } 2842 } 2843 2844 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2845 ierr = MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);CHKERRMPI(ierr); 2846 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2847 2848 /* Restore the indices */ 2849 for (i=0; i<ismax; i++) { 2850 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2851 if (!allcolumns[i]) { 2852 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2853 } 2854 } 2855 2856 for (i=0; i<ismax; i++) { 2857 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2858 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2859 } 2860 2861 /* Destroy allocated memory */ 2862 if (sbuf_aa) { 2863 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2864 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2865 } 2866 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2867 2868 for (i=0; i<nrqs; ++i) { 2869 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2870 } 2871 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2872 2873 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2874 PetscFunctionReturn(0); 2875 } 2876 2877 /* 2878 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 
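   (For each local row i of A and B, rowemb[i] gives the local row of C it lands in; e.g., with m = 4 local rows in C, a hypothetical rowemb = {0,2} places the two rows of A into C's local rows 0 and 2. dcolemb and ocolemb play the same role for the columns of A and B, respectively.)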
2879 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2880 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2881 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2882 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2883 state, and needs to be "assembled" later by compressing B's column space. 2884 2885 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2886 Following this call, C->A & C->B have been created, even if empty. 2887 */ 2888 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2889 { 2890 /* If making this function public, change the error returned in this function away from _PLIB. */ 2891 PetscErrorCode ierr; 2892 Mat_MPIAIJ *aij; 2893 Mat_SeqAIJ *Baij; 2894 PetscBool seqaij,Bdisassembled; 2895 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2896 PetscScalar v; 2897 const PetscInt *rowindices,*colindices; 2898 2899 PetscFunctionBegin; 2900 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2901 if (A) { 2902 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2903 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2904 if (rowemb) { 2905 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2906 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2907 } else { 2908 if (C->rmap->n != A->rmap->n) { 2909 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2910 } 2911 } 2912 if (dcolemb) { 2913 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2914 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2915 } else { 2916 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2917 } 2918 } 2919 if (B) { 2920 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2921 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2922 if (rowemb) { 2923 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2924 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2925 } else { 2926 if (C->rmap->n != B->rmap->n) { 2927 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2928 } 2929 } 2930 if (ocolemb) { 2931 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2932 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2933 } else { 2934 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2935 } 2936 } 2937 2938 aij = (Mat_MPIAIJ*)C->data; 2939 if (!aij->A) { 2940 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2941 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2942 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2943 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2944 ierr = 
MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2945 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2946 } 2947 if (A) { 2948 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2949 } else { 2950 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2951 } 2952 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2953 /* 2954 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2955 need to "disassemble" B -- convert it to using C's global indices. 2956 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2957 2958 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2959 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2960 2961 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2962 At least avoid calling MatSetValues() and the implied searches? 2963 */ 2964 2965 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2966 #if defined(PETSC_USE_CTABLE) 2967 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2968 #else 2969 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2970 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2971 if (aij->B) { 2972 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2973 } 2974 #endif 2975 ngcol = 0; 2976 if (aij->lvec) { 2977 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2978 } 2979 if (aij->garray) { 2980 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2981 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2982 } 2983 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2984 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2985 } 2986 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2987 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2988 } 2989 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2990 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2991 } 2992 } 2993 Bdisassembled = PETSC_FALSE; 2994 if (!aij->B) { 2995 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2996 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2997 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2998 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2999 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3000 Bdisassembled = PETSC_TRUE; 3001 } 3002 if (B) { 3003 Baij = (Mat_SeqAIJ*)B->data; 3004 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3005 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3006 for (i=0; i<B->rmap->n; i++) { 3007 nz[i] = Baij->i[i+1] - Baij->i[i]; 3008 } 3009 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3010 ierr = PetscFree(nz);CHKERRQ(ierr); 3011 } 3012 3013 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3014 shift = rend-rstart; 3015 count = 0; 3016 rowindices = NULL; 3017 colindices = NULL; 3018 if (rowemb) { 3019 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3020 } 3021 if (ocolemb) { 3022 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3023 } 3024 for (i=0; i<B->rmap->n; i++) { 3025 PetscInt row; 3026 row = i; 3027 if (rowindices) row = rowindices[i]; 3028 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3029 col = Baij->j[count]; 3030 if (colindices) col = colindices[col]; 3031 if (Bdisassembled && col>=rstart) col += shift; 3032 v = Baij->a[count]; 3033 ierr = 
MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3034 ++count; 3035 } 3036 } 3037 /* No assembly for aij->B is necessary. */ 3038 /* FIXME: set aij->B's nonzerostate correctly. */ 3039 } else { 3040 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3041 } 3042 C->preallocated = PETSC_TRUE; 3043 C->was_assembled = PETSC_FALSE; 3044 C->assembled = PETSC_FALSE; 3045 /* 3046 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3047 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3048 */ 3049 PetscFunctionReturn(0); 3050 } 3051 3052 /* 3053 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3054 */ 3055 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3056 { 3057 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3058 3059 PetscFunctionBegin; 3060 PetscValidPointer(A,2); 3061 PetscValidPointer(B,3); 3062 /* FIXME: make sure C is assembled */ 3063 *A = aij->A; 3064 *B = aij->B; 3065 /* Note that we don't incref *A and *B, so be careful! */ 3066 PetscFunctionReturn(0); 3067 } 3068 3069 /* 3070 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3071 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3072 */ 3073 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3074 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3075 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3076 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3077 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3078 { 3079 PetscErrorCode ierr; 3080 PetscMPIInt size,flag; 3081 PetscInt i,ii,cismax,ispar; 3082 Mat *A,*B; 3083 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3084 3085 PetscFunctionBegin; 3086 if (!ismax) PetscFunctionReturn(0); 3087 3088 for (i = 0, cismax = 0; i < ismax; ++i) { 3089 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr); 3090 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3091 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr); 3092 if (size > 1) ++cismax; 3093 } 3094 3095 /* 3096 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3097 ispar counts the number of parallel ISs across C's comm. 3098 */ 3099 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRMPI(ierr); 3100 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3101 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3102 PetscFunctionReturn(0); 3103 } 3104 3105 /* if (ispar) */ 3106 /* 3107 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3108 These are used to extract the off-diag portion of the resulting parallel matrix. 3109 The row IS for the off-diag portion is the same as for the diag portion, 3110 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 
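      Below, cisrow[ii] aliases isrow[i] and ciscol[ii] holds the complement for the ii-th parallel IS, while ciscol_p[ii] records the permutation produced when ciscol[ii] is sorted.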
3111 */ 3112 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3113 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3114 for (i = 0, ii = 0; i < ismax; ++i) { 3115 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3116 if (size > 1) { 3117 /* 3118 TODO: This is the part that's ***NOT SCALABLE***. 3119 To fix this we need to extract just the indices of C's nonzero columns 3120 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3121 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3122 to be done without serializing on the IS list, so, most likely, it is best 3123 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3124 */ 3125 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3126 /* Now we have to 3127 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3128 were sorted on each rank, concatenated they might no longer be sorted; 3129 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3130 indices in the nondecreasing order to the original index positions. 3131 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3132 */ 3133 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3134 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3135 ++ii; 3136 } 3137 } 3138 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3139 for (i = 0, ii = 0; i < ismax; ++i) { 3140 PetscInt j,issize; 3141 const PetscInt *indices; 3142 3143 /* 3144 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3145 */ 3146 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3147 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3148 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3149 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3150 for (j = 1; j < issize; ++j) { 3151 if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3152 } 3153 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3154 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3155 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3156 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3157 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3158 for (j = 1; j < issize; ++j) { 3159 if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3160 } 3161 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3162 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3163 if (size > 1) { 3164 cisrow[ii] = isrow[i]; 3165 ++ii; 3166 } 3167 } 3168 /* 3169 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3170 array of sequential matrices underlying the resulting parallel matrices. 3171 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3172 contain duplicates. 3173 3174 There are as many diag matrices as there are original index sets. There are only as many parallel 3175 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
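      (In the loops below, i runs over all ismax index sets while ii runs only over the cismax parallel ones, so the A array has length ismax and the B array has length cismax.)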
  /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall and whether the ISs are sorted and/or
     contain duplicates.

     There are as many diag matrices as there are original index sets. There are only as many parallel
     and off-diag matrices as there are parallel (comm size > 1) index sets.

     ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
     - If the array of MPI matrices already exists and is being reused, we need to allocate the array
       and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
       will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
       with the A[i] and B[ii] extracted from the corresponding MPI submat.
     - However, if the rows, A's column indices, or B's column indices are not sorted, the extracted A[i] & B[ii]
       will have a different order from what getsubmats_seq expects. To handle this case -- indicated
       by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
       (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
       values into the A[i] and B[ii] sitting inside the corresponding submat.
     - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
       A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall == MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
     If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
     matrices A & B have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Note that in that case colmap does not need to be rebuilt, since the matrices are expected
     to have the same sparsity pattern.
     Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
     must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs, and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
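    /*
       Descriptive note (summarizing the assumed division of labor): for a parallel isrow[i], A[i] carries the
       diag block (the locally owned rows of the submatrix against the locally owned part of iscol[i]) and
       B[ii] carries the off-diag block (the same rows against ciscol[ii], the nonlocal part of iscol[i]);
       setseqmats() scatters both into (*submat)[i] using the permutations isrow_p[i], iscol_p[i], and
       ciscol_p[ii] computed above.
    */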
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
         For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submat)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i];
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
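
/*
   Usage sketch (illustrative only; the index-set choice and error handling below are assumptions, not part of
   this file's API): the machinery above is reached through the public interface roughly as follows, where C
   is an assembled MATMPIAIJ matrix and the ISs live on C's communicator, so the parallel branch of
   MatCreateSubMatricesMPI_MPIXAIJ() is exercised.

     PetscInt rstart,rend;
     IS       rowis,colis;
     Mat      *submats;

     ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
     ierr = ISCreateStride(PetscObjectComm((PetscObject)C),rend-rstart,rstart,1,&rowis);CHKERRQ(ierr);
     ierr = ISDuplicate(rowis,&colis);CHKERRQ(ierr);
     ierr = MatCreateSubMatricesMPI(C,1,&rowis,&colis,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
     ...
     ierr = MatDestroySubMatrices(1,&submats);CHKERRQ(ierr);
     ierr = ISDestroy(&rowis);CHKERRQ(ierr);
     ierr = ISDestroy(&colis);CHKERRQ(ierr);
*/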