1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 
PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j,owner; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscCalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++) { 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++) { 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank) { 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 
ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows) { 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++) { 115 if (tosizes_temp[i]) { 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscCalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 for (i=0; i<nto; i++) { 123 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 124 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 125 } 126 /* send information to other processors */ 127 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 128 nrecvrows = 0; 129 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 130 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 131 nrecvrows = 0; 132 for (i=0; i<nfrom; i++) { 133 for (j=0; j<fromsizes[2*i]; j++) { 134 remote[nrecvrows].rank = fromranks[i]; 135 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 136 } 137 } 138 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 139 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 140 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 141 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 142 ierr = 
PetscSFSetFromOptions(sf);CHKERRQ(ierr); 143 /* message pair <no of is, row> */ 144 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 145 for (i=0; i<nrrows; i++) { 146 owner = rrow_ranks[i]; /* processor */ 147 j = tosizes_temp[owner]; /* index */ 148 todata[toffsets[j]++] = rrow_isids[i]; 149 todata[toffsets[j]++] = remoterows[i]; 150 } 151 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 152 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 153 ierr = PetscFree(toffsets);CHKERRQ(ierr); 154 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 155 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 157 /* send rows belonging to the remote so that then we could get the overlapping data back */ 158 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 159 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 160 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 161 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 162 ierr = PetscFree(fromranks);CHKERRQ(ierr); 163 nrecvrows = 0; 164 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 165 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 166 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 167 nrecvrows = 0; 168 for (i=0; i<nto; i++) { 169 for (j=0; j<tosizes[2*i]; j++) { 170 remote[nrecvrows].rank = toranks[i]; 171 remote[nrecvrows++].index = tosizes[2*i+1]+j; 172 } 173 } 174 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 175 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 176 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 177 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 178 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 179 /* overlap communication and 
computation */ 180 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 181 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 182 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 183 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 184 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 185 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 186 ierr = PetscFree(toranks);CHKERRQ(ierr); 187 ierr = PetscFree(tosizes);CHKERRQ(ierr); 188 ierr = PetscFree(todata);CHKERRQ(ierr); 189 PetscFunctionReturn(0); 190 } 191 192 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 193 { 194 PetscInt *isz,isz_i,i,j,is_id, data_size; 195 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 196 const PetscInt *indices_i_temp; 197 MPI_Comm *iscomms; 198 PetscErrorCode ierr; 199 200 PetscFunctionBegin; 201 max_lsize = 0; 202 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 203 for (i=0; i<nidx; i++){ 204 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 205 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 206 isz[i] = lsize; 207 } 208 ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr); 209 for (i=0; i<nidx; i++){ 210 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 211 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 212 ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr); 213 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 214 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 215 } 216 /* retrieve information to get row id and its overlap */ 217 for (i=0; i<nrecvs; ){ 218 is_id = recvdata[i++]; 219 data_size = recvdata[i++]; 220 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 221 isz_i = isz[is_id]; 222 for (j=0; j< data_size; j++){ 223 col = recvdata[i++]; 224 indices_i[isz_i++] = col; 225 } 226 isz[is_id] = isz_i; 227 } 228 /* remove duplicate entities */ 229 for (i=0; i<nidx; i++){ 230 indices_i = indices_temp+(max_lsize+nrecvs)*i; 231 isz_i = isz[i]; 232 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 233 ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 234 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 235 } 236 ierr = PetscFree(isz);CHKERRQ(ierr); 237 ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr); 238 PetscFunctionReturn(0); 239 } 240 241 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 242 { 243 PetscLayout rmap,cmap; 244 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 245 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 246 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 247 const PetscInt *gcols,*ai,*aj,*bi,*bj; 248 Mat amat,bmat; 249 PetscMPIInt rank; 250 PetscBool done; 251 MPI_Comm comm; 252 
PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 256 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 257 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 258 /* Even if the mat is symmetric, we still assume it is not symmetric */ 259 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 260 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 261 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 262 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 263 /* total number of nonzero values is used to estimate the memory usage in the next step */ 264 tnz = ai[an]+bi[bn]; 265 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 266 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 267 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 268 /* to find the longest message */ 269 max_fszs = 0; 270 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 271 /* better way to estimate number of nonzero in the mat??? 
*/ 272 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 273 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 274 rows_pos = 0; 275 totalrows = 0; 276 for (i=0; i<nfrom; i++){ 277 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 278 /* group data together */ 279 for (j=0; j<fromsizes[2*i]; j+=2){ 280 is_id = fromrows[rows_pos++];/* no of is */ 281 rows_i = rows_data[is_id]; 282 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 283 } 284 /* estimate a space to avoid multiple allocations */ 285 for (j=0; j<nidx; j++){ 286 indvc_ij = 0; 287 rows_i = rows_data[j]; 288 for (l=0; l<rows_pos_i[j]; l++){ 289 row = rows_i[l]-rstart; 290 start = ai[row]; 291 end = ai[row+1]; 292 for (k=start; k<end; k++){ /* Amat */ 293 col = aj[k] + cstart; 294 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 295 } 296 start = bi[row]; 297 end = bi[row+1]; 298 for (k=start; k<end; k++) { /* Bmat */ 299 col = gcols[bj[k]]; 300 indices_tmp[indvc_ij++] = col; 301 } 302 } 303 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 304 indv_counts[i*nidx+j] = indvc_ij; 305 totalrows += indvc_ij; 306 } 307 } 308 /* message triple <no of is, number of rows, rows> */ 309 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 310 totalrows = 0; 311 rows_pos = 0; 312 /* use this code again */ 313 for (i=0;i<nfrom;i++){ 314 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 315 for (j=0; j<fromsizes[2*i]; j+=2){ 316 is_id = fromrows[rows_pos++]; 317 rows_i = rows_data[is_id]; 318 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 319 } 320 /* add data */ 321 for (j=0; j<nidx; j++){ 322 if (!indv_counts[i*nidx+j]) continue; 323 indvc_ij = 0; 324 sbdata[totalrows++] = j; 325 sbdata[totalrows++] = indv_counts[i*nidx+j]; 326 sbsizes[2*i] += 2; 327 rows_i = rows_data[j]; 328 for (l=0; l<rows_pos_i[j]; l++){ 329 
row = rows_i[l]-rstart; 330 start = ai[row]; 331 end = ai[row+1]; 332 for (k=start; k<end; k++){ /* Amat */ 333 col = aj[k] + cstart; 334 indices_tmp[indvc_ij++] = col; 335 } 336 start = bi[row]; 337 end = bi[row+1]; 338 for (k=start; k<end; k++) { /* Bmat */ 339 col = gcols[bj[k]]; 340 indices_tmp[indvc_ij++] = col; 341 } 342 } 343 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 344 sbsizes[2*i] += indvc_ij; 345 ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr); 346 totalrows += indvc_ij; 347 } 348 } 349 ierr = PetscCalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 350 for (i=0; i<nfrom; i++){ 351 offsets[i+1] = offsets[i] + sbsizes[2*i]; 352 sbsizes[2*i+1] = offsets[i]; 353 } 354 ierr = PetscFree(offsets);CHKERRQ(ierr); 355 if (sbrowsizes) *sbrowsizes = sbsizes; 356 if (sbrows) *sbrows = sbdata; 357 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 358 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 359 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 360 PetscFunctionReturn(0); 361 } 362 363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 364 { 365 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 366 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 367 PetscInt lsize,lsize_tmp,owner; 368 PetscMPIInt rank; 369 Mat amat,bmat; 370 PetscBool done; 371 PetscLayout cmap,rmap; 372 MPI_Comm comm; 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 377 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 378 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 379 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 380 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 381 ierr = 
MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 382 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 383 /* is it a safe way to compute number of nonzero values ? */ 384 tnz = ai[an]+bi[bn]; 385 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 386 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 387 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 388 /* it is a better way to estimate memory than the old implementation 389 * where global size of matrix is used 390 * */ 391 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 392 for (i=0; i<nidx; i++) { 393 MPI_Comm iscomm; 394 395 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 396 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 397 lsize_tmp = 0; 398 for (j=0; j<lsize; j++) { 399 owner = -1; 400 row = indices[j]; 401 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 402 if (owner != rank) continue; 403 /* local number */ 404 row -= rstart; 405 start = ai[row]; 406 end = ai[row+1]; 407 for (k=start; k<end; k++) { /* Amat */ 408 col = aj[k] + cstart; 409 indices_temp[lsize_tmp++] = col; 410 } 411 start = bi[row]; 412 end = bi[row+1]; 413 for (k=start; k<end; k++) { /* Bmat */ 414 col = gcols[bj[k]]; 415 indices_temp[lsize_tmp++] = col; 416 } 417 } 418 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 419 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr); 420 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 421 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 422 ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 423 ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr); 424 } 425 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 426 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 427 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 428 
PetscFunctionReturn(0); 429 } 430 431 432 /* 433 Sample message format: 434 If a processor A wants processor B to process some elements corresponding 435 to index sets is[1],is[5] 436 mesg [0] = 2 (no of index sets in the mesg) 437 ----------- 438 mesg [1] = 1 => is[1] 439 mesg [2] = sizeof(is[1]); 440 ----------- 441 mesg [3] = 5 => is[5] 442 mesg [4] = sizeof(is[5]); 443 ----------- 444 mesg [5] 445 mesg [n] datas[1] 446 ----------- 447 mesg[n+1] 448 mesg[m] data(is[5]) 449 ----------- 450 451 Notes: 452 nrqs - no of requests sent (or to be sent out) 453 nrqr - no of requests recieved (which have to be or which have been processed 454 */ 455 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 456 { 457 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 458 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 459 const PetscInt **idx,*idx_i; 460 PetscInt *n,**data,len; 461 #if defined(PETSC_USE_CTABLE) 462 PetscTable *table_data,table_data_i; 463 PetscInt *tdata,tcount,tcount_max; 464 #else 465 PetscInt *data_i,*d_p; 466 #endif 467 PetscErrorCode ierr; 468 PetscMPIInt size,rank,tag1,tag2; 469 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 470 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 471 PetscBT *table; 472 MPI_Comm comm; 473 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 474 MPI_Status *s_status,*recv_status; 475 MPI_Comm *iscomms; 476 char *t_p; 477 478 PetscFunctionBegin; 479 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 480 size = c->size; 481 rank = c->rank; 482 M = C->rmap->N; 483 484 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 485 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 486 487 ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 488 489 for (i=0; i<imax; i++) { 490 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 491 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 492 } 493 494 /* evaluate communication - mesg 
to who,length of mesg, and buffer space 495 required. Based on this, buffers are allocated, and data copied into them */ 496 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 497 for (i=0; i<imax; i++) { 498 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/ 499 idx_i = idx[i]; 500 len = n[i]; 501 for (j=0; j<len; j++) { 502 row = idx_i[j]; 503 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 504 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 505 w4[proc]++; 506 } 507 for (j=0; j<size; j++) { 508 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 509 } 510 } 511 512 nrqs = 0; /* no of outgoing messages */ 513 msz = 0; /* total mesg length (for all proc */ 514 w1[rank] = 0; /* no mesg sent to intself */ 515 w3[rank] = 0; 516 for (i=0; i<size; i++) { 517 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 518 } 519 /* pa - is list of processors to communicate with */ 520 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 521 for (i=0,j=0; i<size; i++) { 522 if (w1[i]) {pa[j] = i; j++;} 523 } 524 525 /* Each message would have a header = 1 + 2*(no of IS) + data */ 526 for (i=0; i<nrqs; i++) { 527 j = pa[i]; 528 w1[j] += w2[j] + 2*w3[j]; 529 msz += w1[j]; 530 } 531 532 /* Determine the number of messages to expect, their lengths, from from-ids */ 533 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 534 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 535 536 /* Now post the Irecvs corresponding to these messages */ 537 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 538 539 /* Allocate Memory for outgoing messages */ 540 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 541 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr); 542 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 543 544 { 545 PetscInt *iptr = tmp,ict = 0; 546 for 
(i=0; i<nrqs; i++) { 547 j = pa[i]; 548 iptr += ict; 549 outdat[j] = iptr; 550 ict = w1[j]; 551 } 552 } 553 554 /* Form the outgoing messages */ 555 /* plug in the headers */ 556 for (i=0; i<nrqs; i++) { 557 j = pa[i]; 558 outdat[j][0] = 0; 559 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr); 560 ptr[j] = outdat[j] + 2*w3[j] + 1; 561 } 562 563 /* Memory for doing local proc's work */ 564 { 565 PetscInt M_BPB_imax = 0; 566 #if defined(PETSC_USE_CTABLE) 567 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 568 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 569 for (i=0; i<imax; i++) { 570 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 571 } 572 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 573 for (i=0; i<imax; i++) { 574 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 575 } 576 #else 577 PetscInt Mimax = 0; 578 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 579 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 580 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 581 for (i=0; i<imax; i++) { 582 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 583 data[i] = d_p + M*i; 584 } 585 #endif 586 } 587 588 /* Parse the IS and update local tables and the outgoing buf with the data */ 589 { 590 PetscInt n_i,isz_i,*outdat_j,ctr_j; 591 PetscBT table_i; 592 593 for (i=0; i<imax; i++) { 594 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 595 n_i = n[i]; 596 table_i = table[i]; 597 idx_i = idx[i]; 598 #if defined(PETSC_USE_CTABLE) 599 table_data_i = table_data[i]; 600 #else 601 data_i = data[i]; 602 #endif 603 isz_i = isz[i]; 604 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 605 row = idx_i[j]; 606 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 607 if (proc != rank) { /* copy to the outgoing buffer */ 608 ctr[proc]++; 609 *ptr[proc] = row; 610 ptr[proc]++; 
611 } else if (!PetscBTLookupSet(table_i,row)) { 612 #if defined(PETSC_USE_CTABLE) 613 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 614 #else 615 data_i[isz_i] = row; /* Update the local table */ 616 #endif 617 isz_i++; 618 } 619 } 620 /* Update the headers for the current IS */ 621 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 622 if ((ctr_j = ctr[j])) { 623 outdat_j = outdat[j]; 624 k = ++outdat_j[0]; 625 outdat_j[2*k] = ctr_j; 626 outdat_j[2*k-1] = i; 627 } 628 } 629 isz[i] = isz_i; 630 } 631 } 632 633 /* Now post the sends */ 634 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 635 for (i=0; i<nrqs; ++i) { 636 j = pa[i]; 637 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 638 } 639 640 /* No longer need the original indices */ 641 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 642 for (i=0; i<imax; ++i) { 643 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 644 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 645 } 646 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 647 648 for (i=0; i<imax; ++i) { 649 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 650 } 651 652 /* Do Local work */ 653 #if defined(PETSC_USE_CTABLE) 654 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 655 #else 656 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 657 #endif 658 659 /* Receive messages */ 660 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 661 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 662 663 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 664 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 665 666 /* Phase 1 sends are complete - deallocate buffers */ 667 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 668 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 669 670 ierr = 
PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 671 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 672 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 673 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 674 ierr = PetscFree(rbuf);CHKERRQ(ierr); 675 676 677 /* Send the data back */ 678 /* Do a global reduction to know the buffer space req for incoming messages */ 679 { 680 PetscMPIInt *rw1; 681 682 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 683 684 for (i=0; i<nrqr; ++i) { 685 proc = recv_status[i].MPI_SOURCE; 686 687 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 688 rw1[proc] = isz1[i]; 689 } 690 ierr = PetscFree(onodes1);CHKERRQ(ierr); 691 ierr = PetscFree(olengths1);CHKERRQ(ierr); 692 693 /* Determine the number of messages to expect, their lengths, from from-ids */ 694 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 695 ierr = PetscFree(rw1);CHKERRQ(ierr); 696 } 697 /* Now post the Irecvs corresponding to these messages */ 698 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 699 700 /* Now post the sends */ 701 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 702 for (i=0; i<nrqr; ++i) { 703 j = recv_status[i].MPI_SOURCE; 704 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 705 } 706 707 /* receive work done on other processors */ 708 { 709 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 710 PetscMPIInt idex; 711 PetscBT table_i; 712 MPI_Status *status2; 713 714 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 715 for (i=0; i<nrqs; ++i) { 716 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 717 /* Process the message */ 718 rbuf2_i = rbuf2[idex]; 719 ct1 = 2*rbuf2_i[0]+1; 720 jmax = rbuf2[idex][0]; 721 for (j=1; j<=jmax; j++) { 722 max = rbuf2_i[2*j]; 723 is_no = rbuf2_i[2*j-1]; 724 isz_i = isz[is_no]; 725 table_i = table[is_no]; 726 #if 
defined(PETSC_USE_CTABLE) 727 table_data_i = table_data[is_no]; 728 #else 729 data_i = data[is_no]; 730 #endif 731 for (k=0; k<max; k++,ct1++) { 732 row = rbuf2_i[ct1]; 733 if (!PetscBTLookupSet(table_i,row)) { 734 #if defined(PETSC_USE_CTABLE) 735 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 736 #else 737 data_i[isz_i] = row; 738 #endif 739 isz_i++; 740 } 741 } 742 isz[is_no] = isz_i; 743 } 744 } 745 746 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 747 ierr = PetscFree(status2);CHKERRQ(ierr); 748 } 749 750 #if defined(PETSC_USE_CTABLE) 751 tcount_max = 0; 752 for (i=0; i<imax; ++i) { 753 table_data_i = table_data[i]; 754 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 755 if (tcount_max < tcount) tcount_max = tcount; 756 } 757 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 758 #endif 759 760 for (i=0; i<imax; ++i) { 761 #if defined(PETSC_USE_CTABLE) 762 PetscTablePosition tpos; 763 table_data_i = table_data[i]; 764 765 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 766 while (tpos) { 767 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 768 tdata[--j] = --k; 769 } 770 ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 771 #else 772 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 773 #endif 774 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 775 } 776 777 ierr = PetscFree(iscomms);CHKERRQ(ierr); 778 ierr = PetscFree(onodes2);CHKERRQ(ierr); 779 ierr = PetscFree(olengths2);CHKERRQ(ierr); 780 781 ierr = PetscFree(pa);CHKERRQ(ierr); 782 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 783 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 784 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 785 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 786 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 787 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 788 ierr = PetscFree(s_status);CHKERRQ(ierr); 789 ierr = 
PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  /* each per-IS hash table was allocated in the _Local pass; destroy them all */
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data - pointer to the solutions (plain arrays without
               PETSC_USE_CTABLE, per-IS PetscTable hash tables with it)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;   /* maps local off-diagonal (B) column indices to global columns */

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[]; the table stores
       (row+1 -> position+1), both shifted by one since PetscTable cannot hold 0 */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      /* undo the +1 shifts; NOTE(review): the bounds check below runs after the
         store into tdata[] — confirm j is always <= tcount on entry */
      tdata[--j] = --row;
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];   /* only expand the rows present before this pass */

    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      /* add all column indices of this local row; the bit table filters duplicates */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: diagonal block, shift to global columns */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: off-diagonal block, map through garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz = total number of requested rows; ct = number of (is,count) headers */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic estimate of the reply size: average nonzeros per local row times
     the number of requested rows; grown on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* all xdata[i] share one contiguous allocation anchored at xdata[0] */
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor past the header of rbuf_i */
    ct2    =  ct1;          /* write cursor into xdata[i] */
    ct3   += ct1;           /* total words written across all replies */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and re-base all xdata[] pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each accepted row by its column indices (the overlap) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global columns */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* NOTE(review): writes one past nrqr on the last iteration; xdata must have nrqr+1 slots — confirm at the allocation site */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number
of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* fill in only the locally owned slice; Allgatherv below completes the rest */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into the
       slot of b->j that corresponds to this rank's row range
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {  /* MAT_REUSE_MATRIX: structure already built, reuse it */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are interleaved lower/diagonal/upper exactly like the column
       indices above so the two gathers stay aligned */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {  /* MAT_REUSE_MATRIX path: not allocated above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry flags from the parallel matrix to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt    ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt    *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status  *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm    comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1262 PetscMPIInt *onodes1,*olengths1,idex,end; 1263 Mat_SubSppt *smatis1; 1264 PetscBool isrowsorted,iscolsorted; 1265 1266 PetscFunctionBegin; 1267 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1268 1269 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1270 size = c->size; 1271 rank = c->rank; 1272 1273 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1274 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1275 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1276 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1277 if (allcolumns) { 1278 icol = NULL; 1279 ncol = C->cmap->N; 1280 } else { 1281 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1282 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1283 } 1284 1285 if (scall == MAT_INITIAL_MATRIX) { 1286 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1287 1288 /* Get some new tags to keep the communication clean */ 1289 tag1 = ((PetscObject)C)->tag; 1290 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1291 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1292 1293 /* evaluate communication - mesg to who, length of mesg, and buffer space 1294 required. 
Based on this, buffers are allocated, and data copied into them */ 1295 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1296 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1297 1298 /* w1[proc] = num of rows owned by proc -- to be requested */ 1299 proc = 0; 1300 nrqs = 0; /* num of outgoing messages */ 1301 for (j=0; j<nrow; j++) { 1302 row = irow[j]; 1303 if (!isrowsorted) proc = 0; 1304 while (row >= C->rmap->range[proc+1]) proc++; 1305 w1[proc]++; 1306 row2proc[j] = proc; /* map row index to proc */ 1307 1308 if (proc != rank && !w2[proc]) { 1309 w2[proc] = 1; nrqs++; 1310 } 1311 } 1312 w1[rank] = 0; /* rows owned by self will not be requested */ 1313 1314 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1315 for (proc=0,j=0; proc<size; proc++) { 1316 if (w1[proc]) { pa[j++] = proc;} 1317 } 1318 1319 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1320 msz = 0; /* total mesg length (for all procs) */ 1321 for (i=0; i<nrqs; i++) { 1322 proc = pa[i]; 1323 w1[proc] += 3; 1324 msz += w1[proc]; 1325 } 1326 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1327 1328 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1329 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1330 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1331 1332 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1333 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1334 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1335 1336 /* Now post the Irecvs corresponding to these messages */ 1337 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1338 1339 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1340 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1341 1342 /* Allocate Memory for outgoing messages 
*/ 1343 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1344 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1345 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1346 1347 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1348 iptr = tmp; 1349 for (i=0; i<nrqs; i++) { 1350 proc = pa[i]; 1351 sbuf1[proc] = iptr; 1352 iptr += w1[proc]; 1353 } 1354 1355 /* Form the outgoing messages */ 1356 /* Initialize the header space */ 1357 for (i=0; i<nrqs; i++) { 1358 proc = pa[i]; 1359 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1360 ptr[proc] = sbuf1[proc] + 3; 1361 } 1362 1363 /* Parse the isrow and copy data into outbuf */ 1364 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1365 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1366 proc = row2proc[j]; 1367 if (proc != rank) { /* copy to the outgoing buf*/ 1368 *ptr[proc] = irow[j]; 1369 ctr[proc]++; ptr[proc]++; 1370 } 1371 } 1372 1373 /* Update the headers for the current IS */ 1374 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1375 if ((ctr_j = ctr[j])) { 1376 sbuf1_j = sbuf1[j]; 1377 k = ++sbuf1_j[0]; 1378 sbuf1_j[2*k] = ctr_j; 1379 sbuf1_j[2*k-1] = 0; 1380 } 1381 } 1382 1383 /* Now post the sends */ 1384 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1385 for (i=0; i<nrqs; ++i) { 1386 proc = pa[i]; 1387 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1388 } 1389 1390 /* Post Receives to capture the buffer size */ 1391 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1392 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1393 1394 rbuf2[0] = tmp + msz; 1395 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1396 1397 for (i=0; i<nrqs; ++i) { 1398 proc = pa[i]; 1399 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1400 } 1401 1402 ierr = 
PetscFree2(w1,w2);CHKERRQ(ierr); 1403 1404 /* Send to other procs the buf size they should allocate */ 1405 /* Receive messages*/ 1406 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1407 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1408 1409 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1410 for (i=0; i<nrqr; ++i) { 1411 req_size[i] = 0; 1412 rbuf1_i = rbuf1[i]; 1413 start = 2*rbuf1_i[0] + 1; 1414 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1415 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1416 sbuf2_i = sbuf2[i]; 1417 for (j=start; j<end; j++) { 1418 k = rbuf1_i[j] - rstart; 1419 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1420 sbuf2_i[j] = ncols; 1421 req_size[i] += ncols; 1422 } 1423 req_source1[i] = r_status1[i].MPI_SOURCE; 1424 1425 /* form the header */ 1426 sbuf2_i[0] = req_size[i]; 1427 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1428 1429 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1430 } 1431 1432 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1433 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1434 1435 /* rbuf2 is received, Post recv column indices a->j */ 1436 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1437 1438 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1439 for (i=0; i<nrqs; ++i) { 1440 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1441 req_source2[i] = r_status2[i].MPI_SOURCE; 1442 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1443 } 1444 1445 /* Wait on sends1 and sends2 */ 1446 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1447 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1448 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1449 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1450 1451 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1452 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1453 1454 /* Now allocate sending buffers for a->j, and send them off */ 1455 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1456 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1457 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1458 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1459 1460 for (i=0; i<nrqr; i++) { /* for each requested message */ 1461 rbuf1_i = rbuf1[i]; 1462 sbuf_aj_i = sbuf_aj[i]; 1463 ct1 = 2*rbuf1_i[0] + 1; 1464 ct2 = 0; 1465 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1466 1467 kmax = rbuf1[i][2]; 1468 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1469 row = rbuf1_i[ct1] - rstart; 1470 nzA = ai[row+1] - ai[row]; 1471 nzB = bi[row+1] - bi[row]; 1472 ncols = nzA + nzB; 1473 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1474 1475 /* load the column indices for this row into cols*/ 1476 cols = sbuf_aj_i + ct2; 1477 1478 lwrite = 0; 1479 for (l=0; l<nzB; l++) { 1480 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1481 } 1482 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1483 for (l=0; l<nzB; l++) { 1484 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1485 } 1486 1487 ct2 += ncols; 1488 } 1489 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1490 } 1491 1492 /* create column map (cmap): global col of C -> local col of submat */ 1493 #if defined(PETSC_USE_CTABLE) 1494 if (!allcolumns) { 1495 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1496 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1497 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1498 if (icol[j] >= cstart && icol[j] <cend) { 1499 cmap_loc[icol[j] - cstart] = j+1; 1500 } else { /* use PetscTable for non-local col indices */ 1501 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1502 } 1503 
} 1504 } else { 1505 cmap = NULL; 1506 cmap_loc = NULL; 1507 } 1508 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1509 #else 1510 if (!allcolumns) { 1511 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1512 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1513 } else { 1514 cmap = NULL; 1515 } 1516 #endif 1517 1518 /* Create lens for MatSeqAIJSetPreallocation() */ 1519 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1520 1521 /* Compute lens from local part of C */ 1522 for (j=0; j<nrow; j++) { 1523 row = irow[j]; 1524 proc = row2proc[j]; 1525 if (proc == rank) { 1526 /* diagonal part A = c->A */ 1527 ncols = ai[row-rstart+1] - ai[row-rstart]; 1528 cols = aj + ai[row-rstart]; 1529 if (!allcolumns) { 1530 for (k=0; k<ncols; k++) { 1531 #if defined(PETSC_USE_CTABLE) 1532 tcol = cmap_loc[cols[k]]; 1533 #else 1534 tcol = cmap[cols[k]+cstart]; 1535 #endif 1536 if (tcol) lens[j]++; 1537 } 1538 } else { /* allcolumns */ 1539 lens[j] = ncols; 1540 } 1541 1542 /* off-diagonal part B = c->B */ 1543 ncols = bi[row-rstart+1] - bi[row-rstart]; 1544 cols = bj + bi[row-rstart]; 1545 if (!allcolumns) { 1546 for (k=0; k<ncols; k++) { 1547 #if defined(PETSC_USE_CTABLE) 1548 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1549 #else 1550 tcol = cmap[bmap[cols[k]]]; 1551 #endif 1552 if (tcol) lens[j]++; 1553 } 1554 } else { /* allcolumns */ 1555 lens[j] += ncols; 1556 } 1557 } 1558 } 1559 1560 /* Create row map (rmap): global row of C -> local row of submat */ 1561 #if defined(PETSC_USE_CTABLE) 1562 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1563 for (j=0; j<nrow; j++) { 1564 row = irow[j]; 1565 proc = row2proc[j]; 1566 if (proc == rank) { /* a local row */ 1567 rmap_loc[row - rstart] = j; 1568 } else { 1569 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1570 } 1571 } 1572 #else 1573 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1574 for (j=0; j<nrow; j++) { 1575 rmap[irow[j]] = j; 1576 } 1577 #endif 1578 
1579 /* Update lens from offproc data */ 1580 /* recv a->j is done */ 1581 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1582 for (i=0; i<nrqs; i++) { 1583 proc = pa[i]; 1584 sbuf1_i = sbuf1[proc]; 1585 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1586 ct1 = 2 + 1; 1587 ct2 = 0; 1588 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1589 rbuf3_i = rbuf3[i]; /* received C->j */ 1590 1591 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1592 max1 = sbuf1_i[2]; 1593 for (k=0; k<max1; k++,ct1++) { 1594 #if defined(PETSC_USE_CTABLE) 1595 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1596 row--; 1597 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1598 #else 1599 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1600 #endif 1601 /* Now, store row index of submat in sbuf1_i[ct1] */ 1602 sbuf1_i[ct1] = row; 1603 1604 nnz = rbuf2_i[ct1]; 1605 if (!allcolumns) { 1606 for (l=0; l<nnz; l++,ct2++) { 1607 #if defined(PETSC_USE_CTABLE) 1608 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1609 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1610 } else { 1611 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1612 } 1613 #else 1614 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1615 #endif 1616 if (tcol) lens[row]++; 1617 } 1618 } else { /* allcolumns */ 1619 lens[row] += nnz; 1620 } 1621 } 1622 } 1623 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1624 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1625 1626 /* Create the submatrices */ 1627 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1628 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1629 1630 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1631 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1632 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1633 ierr = 
MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1634 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1635 1636 /* create struct Mat_SubSppt and attached it to submat */ 1637 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1638 subc = (Mat_SeqAIJ*)submat->data; 1639 subc->submatis1 = smatis1; 1640 1641 smatis1->id = 0; 1642 smatis1->nrqs = nrqs; 1643 smatis1->nrqr = nrqr; 1644 smatis1->rbuf1 = rbuf1; 1645 smatis1->rbuf2 = rbuf2; 1646 smatis1->rbuf3 = rbuf3; 1647 smatis1->sbuf2 = sbuf2; 1648 smatis1->req_source2 = req_source2; 1649 1650 smatis1->sbuf1 = sbuf1; 1651 smatis1->ptr = ptr; 1652 smatis1->tmp = tmp; 1653 smatis1->ctr = ctr; 1654 1655 smatis1->pa = pa; 1656 smatis1->req_size = req_size; 1657 smatis1->req_source1 = req_source1; 1658 1659 smatis1->allcolumns = allcolumns; 1660 smatis1->singleis = PETSC_TRUE; 1661 smatis1->row2proc = row2proc; 1662 smatis1->rmap = rmap; 1663 smatis1->cmap = cmap; 1664 #if defined(PETSC_USE_CTABLE) 1665 smatis1->rmap_loc = rmap_loc; 1666 smatis1->cmap_loc = cmap_loc; 1667 #endif 1668 1669 smatis1->destroy = submat->ops->destroy; 1670 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1671 submat->factortype = C->factortype; 1672 1673 /* compute rmax */ 1674 rmax = 0; 1675 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1676 1677 } else { /* scall == MAT_REUSE_MATRIX */ 1678 submat = submats[0]; 1679 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1680 1681 subc = (Mat_SeqAIJ*)submat->data; 1682 rmax = subc->rmax; 1683 smatis1 = subc->submatis1; 1684 nrqs = smatis1->nrqs; 1685 nrqr = smatis1->nrqr; 1686 rbuf1 = smatis1->rbuf1; 1687 rbuf2 = smatis1->rbuf2; 1688 rbuf3 = smatis1->rbuf3; 1689 req_source2 = smatis1->req_source2; 1690 1691 sbuf1 = smatis1->sbuf1; 1692 sbuf2 = smatis1->sbuf2; 1693 ptr = smatis1->ptr; 1694 tmp = smatis1->tmp; 1695 ctr = smatis1->ctr; 1696 1697 pa = smatis1->pa; 1698 req_size = smatis1->req_size; 1699 req_source1 = smatis1->req_source1; 1700 1701 allcolumns = smatis1->allcolumns; 1702 row2proc = smatis1->row2proc; 1703 rmap = smatis1->rmap; 1704 cmap = smatis1->cmap; 1705 #if defined(PETSC_USE_CTABLE) 1706 rmap_loc = smatis1->rmap_loc; 1707 cmap_loc = smatis1->cmap_loc; 1708 #endif 1709 } 1710 1711 /* Post recv matrix values */ 1712 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1713 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1714 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1715 for (i=0; i<nrqs; ++i) { 1716 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1717 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1718 } 1719 1720 /* Allocate sending buffers for a->a, and send them off */ 1721 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1722 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1723 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1724 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1725 1726 for (i=0; i<nrqr; i++) { 1727 rbuf1_i = rbuf1[i]; 1728 sbuf_aa_i = sbuf_aa[i]; 1729 ct1 = 2*rbuf1_i[0]+1; 1730 ct2 = 0; 1731 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1732 1733 kmax = rbuf1_i[2]; 1734 for (k=0; k<kmax; k++,ct1++) { 1735 row = rbuf1_i[ct1] - rstart; 1736 nzA = ai[row+1] - ai[row]; 1737 nzB = bi[row+1] - 
bi[row]; 1738 ncols = nzA + nzB; 1739 cworkB = bj + bi[row]; 1740 vworkA = a_a + ai[row]; 1741 vworkB = b_a + bi[row]; 1742 1743 /* load the column values for this row into vals*/ 1744 vals = sbuf_aa_i + ct2; 1745 1746 lwrite = 0; 1747 for (l=0; l<nzB; l++) { 1748 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1749 } 1750 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1751 for (l=0; l<nzB; l++) { 1752 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1753 } 1754 1755 ct2 += ncols; 1756 } 1757 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1758 } 1759 1760 /* Assemble submat */ 1761 /* First assemble the local rows */ 1762 for (j=0; j<nrow; j++) { 1763 row = irow[j]; 1764 proc = row2proc[j]; 1765 if (proc == rank) { 1766 Crow = row - rstart; /* local row index of C */ 1767 #if defined(PETSC_USE_CTABLE) 1768 row = rmap_loc[Crow]; /* row index of submat */ 1769 #else 1770 row = rmap[row]; 1771 #endif 1772 1773 if (allcolumns) { 1774 /* diagonal part A = c->A */ 1775 ncols = ai[Crow+1] - ai[Crow]; 1776 cols = aj + ai[Crow]; 1777 vals = a->a + ai[Crow]; 1778 i = 0; 1779 for (k=0; k<ncols; k++) { 1780 subcols[i] = cols[k] + cstart; 1781 subvals[i++] = vals[k]; 1782 } 1783 1784 /* off-diagonal part B = c->B */ 1785 ncols = bi[Crow+1] - bi[Crow]; 1786 cols = bj + bi[Crow]; 1787 vals = b->a + bi[Crow]; 1788 for (k=0; k<ncols; k++) { 1789 subcols[i] = bmap[cols[k]]; 1790 subvals[i++] = vals[k]; 1791 } 1792 1793 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1794 1795 } else { /* !allcolumns */ 1796 #if defined(PETSC_USE_CTABLE) 1797 /* diagonal part A = c->A */ 1798 ncols = ai[Crow+1] - ai[Crow]; 1799 cols = aj + ai[Crow]; 1800 vals = a->a + ai[Crow]; 1801 i = 0; 1802 for (k=0; k<ncols; k++) { 1803 tcol = cmap_loc[cols[k]]; 1804 if (tcol) { 1805 subcols[i] = --tcol; 1806 subvals[i++] = vals[k]; 1807 } 1808 } 1809 1810 /* off-diagonal part B = c->B */ 
1811 ncols = bi[Crow+1] - bi[Crow]; 1812 cols = bj + bi[Crow]; 1813 vals = b->a + bi[Crow]; 1814 for (k=0; k<ncols; k++) { 1815 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1816 if (tcol) { 1817 subcols[i] = --tcol; 1818 subvals[i++] = vals[k]; 1819 } 1820 } 1821 #else 1822 /* diagonal part A = c->A */ 1823 ncols = ai[Crow+1] - ai[Crow]; 1824 cols = aj + ai[Crow]; 1825 vals = a->a + ai[Crow]; 1826 i = 0; 1827 for (k=0; k<ncols; k++) { 1828 tcol = cmap[cols[k]+cstart]; 1829 if (tcol) { 1830 subcols[i] = --tcol; 1831 subvals[i++] = vals[k]; 1832 } 1833 } 1834 1835 /* off-diagonal part B = c->B */ 1836 ncols = bi[Crow+1] - bi[Crow]; 1837 cols = bj + bi[Crow]; 1838 vals = b->a + bi[Crow]; 1839 for (k=0; k<ncols; k++) { 1840 tcol = cmap[bmap[cols[k]]]; 1841 if (tcol) { 1842 subcols[i] = --tcol; 1843 subvals[i++] = vals[k]; 1844 } 1845 } 1846 #endif 1847 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1848 } 1849 } 1850 } 1851 1852 /* Now assemble the off-proc rows */ 1853 for (i=0; i<nrqs; i++) { /* for each requested message */ 1854 /* recv values from other processes */ 1855 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1856 proc = pa[idex]; 1857 sbuf1_i = sbuf1[proc]; 1858 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1859 ct1 = 2 + 1; 1860 ct2 = 0; /* count of received C->j */ 1861 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1862 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1863 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1864 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1865 1866 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1867 max1 = sbuf1_i[2]; /* num of rows */ 1868 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1869 row = sbuf1_i[ct1]; /* row index of submat */ 
1870 if (!allcolumns) { 1871 idex = 0; 1872 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1873 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1874 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1875 #if defined(PETSC_USE_CTABLE) 1876 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1877 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1878 } else { 1879 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1880 } 1881 #else 1882 tcol = cmap[rbuf3_i[ct2]]; 1883 #endif 1884 if (tcol) { 1885 subcols[idex] = --tcol; /* may not be sorted */ 1886 subvals[idex++] = rbuf4_i[ct2]; 1887 1888 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1889 For reuse, we replace received C->j with index that should be inserted to submat */ 1890 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1891 } 1892 } 1893 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1894 } else { /* scall == MAT_REUSE_MATRIX */ 1895 submat = submats[0]; 1896 subc = (Mat_SeqAIJ*)submat->data; 1897 1898 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1899 for (l=0; l<nnz; l++) { 1900 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1901 subvals[idex++] = rbuf4_i[ct2]; 1902 } 1903 1904 bj = subc->j + subc->i[row]; /* sorted column indices */ 1905 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1906 } 1907 } else { /* allcolumns */ 1908 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1909 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1910 ct2 += nnz; 1911 } 1912 } 1913 } 1914 1915 /* sending a->a are done */ 1916 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1917 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1918 1919 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1920 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1921 submats[0] = submat; 1922 1923 /* Restore the indices */ 1924 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1925 if (!allcolumns) { 1926 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1927 } 1928 1929 /* Destroy allocated memory */ 1930 for (i=0; i<nrqs; ++i) { 1931 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1932 } 1933 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1934 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1935 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1936 1937 if (scall == MAT_INITIAL_MATRIX) { 1938 ierr = PetscFree(lens);CHKERRQ(ierr); 1939 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1940 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1941 } 1942 PetscFunctionReturn(0); 1943 } 1944 1945 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1946 { 1947 PetscErrorCode ierr; 1948 PetscInt ncol; 1949 PetscBool colflag,allcolumns=PETSC_FALSE; 1950 1951 PetscFunctionBegin; 1952 /* Allocate memory to hold all the submatrices */ 1953 if (scall == MAT_INITIAL_MATRIX) { 1954 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1955 } 1956 1957 /* Check for special case: each processor gets entire matrix columns */ 1958 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1959 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1960 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1961 1962 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1963 PetscFunctionReturn(0); 1964 } 1965 1966 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1967 { 1968 PetscErrorCode ierr; 1969 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1970 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1971 Mat_SeqAIJ *subc; 1972 Mat_SubSppt *smat; 1973 1974 
PetscFunctionBegin; 1975 /* Check for special case: each processor has a single IS */ 1976 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1977 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1978 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-singlis */ 1979 PetscFunctionReturn(0); 1980 } 1981 1982 /* Collect global wantallmatrix and nstages */ 1983 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1984 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1985 if (!nmax) nmax = 1; 1986 1987 if (scall == MAT_INITIAL_MATRIX) { 1988 /* Collect global wantallmatrix and nstages */ 1989 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1990 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1991 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1992 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1993 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1994 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1995 wantallmatrix = PETSC_TRUE; 1996 1997 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 1998 } 1999 } 2000 2001 /* Determine the number of stages through which submatrices are done 2002 Each stage will extract nmax submatrices. 2003 nmax is determined by the matrix column dimension. 2004 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2005 */ 2006 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 2007 2008 in[0] = -1*(PetscInt)wantallmatrix; 2009 in[1] = nstages; 2010 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2011 wantallmatrix = (PetscBool)(-out[0]); 2012 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2013 2014 } else { /* MAT_REUSE_MATRIX */ 2015 if (ismax) { 2016 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2017 smat = subc->submatis1; 2018 } else { /* (*submat)[0] is a dummy matrix */ 2019 smat = (Mat_SubSppt*)(*submat)[0]->data; 2020 } 2021 if (!smat) { 2022 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2023 wantallmatrix = PETSC_TRUE; 2024 } else if (smat->singleis) { 2025 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2026 PetscFunctionReturn(0); 2027 } else { 2028 nstages = smat->nstages; 2029 } 2030 } 2031 2032 if (wantallmatrix) { 2033 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2034 PetscFunctionReturn(0); 2035 } 2036 2037 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2038 if (scall == MAT_INITIAL_MATRIX) { 2039 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2040 } 2041 2042 for (i=0,pos=0; i<nstages; i++) { 2043 if (pos+nmax <= ismax) max_no = nmax; 2044 else if (pos >= ismax) max_no = 0; 2045 else max_no = ismax-pos; 2046 2047 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2048 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2049 smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++; 2050 smat->nstages = nstages; 2051 } 2052 pos += max_no; 2053 } 2054 2055 if (ismax && scall == MAT_INITIAL_MATRIX) { 2056 /* save nstages for reuse */ 2057 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2058 smat = subc->submatis1; 2059 smat->nstages = nstages; 2060 } 2061 PetscFunctionReturn(0); 2062 } 2063 2064 
/* -------------------------------------------------------------------------*/
/*
   MatCreateSubMatrices_MPIAIJ_Local - does the actual communication and assembly
   for one stage of MatCreateSubMatrices_MPIAIJ: extracts ismax sequential
   submatrices C(isrow[i],iscol[i]) from the parallel AIJ matrix C into submats.

   Protocol (MAT_INITIAL_MATRIX):
     1. map each requested row to its owner rank and size the messages;
     2. send row requests (tag0), receive per-row nonzero counts back (tag2);
     3. receive the column indices of the requested rows (tag3);
     4. build per-IS column maps (global col -> local submat col) and row maps,
        compute row lengths, preallocate each submat, and stash all the
        communication buffers in a Mat_SubSppt struct for later reuse;
     5. receive the numerical values (tag4) and assemble.
   On MAT_REUSE_MATRIX, steps 1-4 are skipped and all buffers/maps are
   recovered from the Mat_SubSppt attached to each submat.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;                 /* diagonal block of C */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
  ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);

  /* Pull out the indices of every IS; note whether any iscol is unsorted
     (iscsorted), and detect the per-IS "allcolumns" identity special case. */
  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
    if (!issorted[i]) iscsorted = issorted[i];

    ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);

    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);

    /* Check for special case: allcolumn */
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i]       = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);

      /* Recover communication buffers and maps saved on the first call. */
      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
      if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them*/
    ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */

    /* w1[p] = #rows requested from rank p; w3[p] = #ISs touching rank p.
       When isrow[i] is sorted the owner search can continue from the previous
       row's owner instead of restarting at rank 0. */
    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
      row2proc[i] = row2proc_i;

      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j      = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz   += w1[j];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    tag0 = ((PetscObject)C)->tag;
    ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
    ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

    {
      /* Carve the single 'tmp' buffer into one send buffer per destination. */
      PetscInt *iptr = tmp;
      k = 0;
      for (i=0; i<nrqs; i++) {
        j        = pa[i];
        iptr    += k;
        sbuf1[j] = iptr;
        k        = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
      ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0];
          sbuf1_j[2*k]   = ctr_j;
          sbuf1_j[2*k-1] = i;
        }
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
    /* rbuf2 reuses the tail of 'tmp'; the i-th reply has the same length as
       the i-th request (same header + one count per requested row). */
    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      /* For every incoming request, report the nonzero count of each asked-for
         row (diag + offdiag parts) so the requester can size its buffers. */
      ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i        = rbuf1[i];
        start          = 2*rbuf1_i[0] + 1;
        ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
        ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
        sbuf2_i        = sbuf2[i];
        for (j=start; j<end; j++) {
          id              = rbuf1_i[j] - rstart;
          ncols           = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
          sbuf2_i[j]      = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = r_status1[i].MPI_SOURCE;
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);
    ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

    /* Receive messages*/
    ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }
    ierr = PetscFree(r_status2);CHKERRQ(ierr);
    ierr = PetscFree(r_waits2);CHKERRQ(ierr);

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

    if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
    ierr = PetscFree(s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_status2);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols;
               emitted in global order: offdiag cols < cstart, then the diag
               block, then offdiag cols >= cend */
            cols = sbuf_aj_i + ct2;

            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);

    /* create col map: global col of C -> local col of submatrices */
    /* Value stored/looked up is local-column-index + 1 so that 0 means
       "column not selected by this IS". */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            cmap_i[icol_i[j]] = j+1;
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    /* lens[i][j] = number of selected entries in local row j of submat i;
       all ismax arrays share one zeroed allocation rooted at lens[0]. */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);

    if (ismax) {
      ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    for (i=0; i<ismax; i++) {
      ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;
      }
    }
#endif

    /* Update lens from offproc data */
    /* Walk each reply in step with the request that produced it: sbuf1 gives
       (IS id, #rows) pairs, rbuf2 the per-row counts, rbuf3 the column indices. */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
            row--;
            if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    ierr = PetscFree(r_waits3);CHKERRQ(ierr);
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
    ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree(s_waits3);CHKERRQ(ierr);

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      /* The shared buffers are saved once per submat; MatDestroySubMatrix_SeqAIJ
         uses the saved 'destroy' and frees them when the last submat dies. */
      ierr = PetscNew(&smat_i);CHKERRQ(ierr);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = allcolumns[i];
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = row2proc[i];
      smat_i->rmap       = rmap[i];
      smat_i->cmap       = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = NULL;
      smat_i->rmap       = NULL;
      smat_i->cmap       = NULL;
    }

    if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals*/
          /* Same left/diag/right ordering as the column-index pass above, so
             values line up one-to-one with the indices sent on tag3. */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  /* Entries are written straight into the SeqAIJ arrays (subc->j/subc->a) using
     the preallocated row offsets; ilen tracks how many were actually kept. */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc      = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen = subc->ilen;
    imat_j    = subc->j;
    imat_i    = subc->i;
    imat_a    = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
        mat_i    = imat_i[row];
        mat_a    = imat_a + mat_i;
        mat_j    = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k];  /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no     = sbuf1_i[2*j-1];
      rmap_i    = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  /* Note: the buffers saved in Mat_SubSppt (rbuf1..3, sbuf1/2, pa, ...) are NOT
     freed here; they are kept for MAT_REUSE_MATRIX and released by the
     submatrix destroy routine. */
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);

  ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
    Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.
2882 2883 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2884 Following this call, C->A & C->B have been created, even if empty. 2885 */ 2886 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2887 { 2888 /* If making this function public, change the error returned in this function away from _PLIB. */ 2889 PetscErrorCode ierr; 2890 Mat_MPIAIJ *aij; 2891 Mat_SeqAIJ *Baij; 2892 PetscBool seqaij,Bdisassembled; 2893 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2894 PetscScalar v; 2895 const PetscInt *rowindices,*colindices; 2896 2897 PetscFunctionBegin; 2898 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2899 if (A) { 2900 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2901 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2902 if (rowemb) { 2903 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2904 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2905 } else { 2906 if (C->rmap->n != A->rmap->n) { 2907 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2908 } 2909 } 2910 if (dcolemb) { 2911 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2912 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2913 } else { 2914 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2915 } 2916 } 2917 if (B) { 2918 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2919 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2920 if (rowemb) { 2921 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2922 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2923 } else { 2924 if (C->rmap->n != B->rmap->n) { 2925 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2926 } 2927 } 2928 if (ocolemb) { 2929 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2930 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2931 } else { 2932 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2933 } 2934 } 2935 2936 aij = (Mat_MPIAIJ*)C->data; 2937 if (!aij->A) { 2938 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2939 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2940 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2941 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2942 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2943 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2944 } 2945 if (A) { 2946 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2947 } else { 2948 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2949 } 2950 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2951 /* 2952 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2953 need to "disassemble" B -- convert it to using C's global indices. 2954 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2955 2956 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2957 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2958 2959 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2960 At least avoid calling MatSetValues() and the implied searches? 2961 */ 2962 2963 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2964 #if defined(PETSC_USE_CTABLE) 2965 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2966 #else 2967 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2968 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2969 if (aij->B) { 2970 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2971 } 2972 #endif 2973 ngcol = 0; 2974 if (aij->lvec) { 2975 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2976 } 2977 if (aij->garray) { 2978 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2979 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2980 } 2981 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2982 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2983 } 2984 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2985 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2986 } 2987 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2988 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2989 } 2990 } 2991 Bdisassembled = PETSC_FALSE; 2992 if (!aij->B) { 2993 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2994 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2995 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2996 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2997 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 2998 Bdisassembled = PETSC_TRUE; 2999 } 3000 if (B) { 3001 Baij = (Mat_SeqAIJ*)B->data; 3002 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3003 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3004 for (i=0; i<B->rmap->n; i++) { 3005 nz[i] = Baij->i[i+1] - Baij->i[i]; 3006 } 3007 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3008 ierr = PetscFree(nz);CHKERRQ(ierr); 3009 } 3010 3011 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3012 shift = rend-rstart; 3013 count = 0; 3014 rowindices = NULL; 3015 colindices = NULL; 3016 if (rowemb) { 3017 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3018 } 3019 if (ocolemb) { 3020 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3021 } 3022 for (i=0; i<B->rmap->n; i++) { 3023 PetscInt row; 3024 row = i; 3025 if (rowindices) row = rowindices[i]; 3026 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3027 col = Baij->j[count]; 3028 if (colindices) col = colindices[col]; 3029 if (Bdisassembled && col>=rstart) col += shift; 3030 v = Baij->a[count]; 3031 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3032 ++count; 3033 } 3034 } 3035 /* No assembly for aij->B is necessary. */ 3036 /* FIXME: set aij->B's nonzerostate correctly. */ 3037 } else { 3038 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3039 } 3040 C->preallocated = PETSC_TRUE; 3041 C->was_assembled = PETSC_FALSE; 3042 C->assembled = PETSC_FALSE; 3043 /* 3044 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3045 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3046 */ 3047 PetscFunctionReturn(0); 3048 } 3049 3050 /* 3051 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3052 */ 3053 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3054 { 3055 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3056 3057 PetscFunctionBegin; 3058 PetscValidPointer(A,2); 3059 PetscValidPointer(B,3); 3060 /* FIXME: make sure C is assembled */ 3061 *A = aij->A; 3062 *B = aij->B; 3063 /* Note that we don't incref *A and *B, so be careful! */ 3064 PetscFunctionReturn(0); 3065 } 3066 3067 /* 3068 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
3069 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3070 */ 3071 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3072 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3073 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3074 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3075 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3076 { 3077 PetscErrorCode ierr; 3078 PetscMPIInt isize,flag; 3079 PetscInt i,ii,cismax,ispar; 3080 Mat *A,*B; 3081 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3082 3083 PetscFunctionBegin; 3084 if (!ismax) PetscFunctionReturn(0); 3085 3086 for (i = 0, cismax = 0; i < ismax; ++i) { 3087 PetscMPIInt isize; 3088 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3089 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3090 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3091 if (isize > 1) ++cismax; 3092 } 3093 3094 /* 3095 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3096 ispar counts the number of parallel ISs across C's comm. 3097 */ 3098 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3099 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3100 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3101 PetscFunctionReturn(0); 3102 } 3103 3104 /* if (ispar) */ 3105 /* 3106 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3107 These are used to extract the off-diag portion of the resulting parallel matrix. 
3108 The row IS for the off-diag portion is the same as for the diag portion, 3109 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3110 */ 3111 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3112 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3113 for (i = 0, ii = 0; i < ismax; ++i) { 3114 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3115 if (isize > 1) { 3116 /* 3117 TODO: This is the part that's ***NOT SCALABLE***. 3118 To fix this we need to extract just the indices of C's nonzero columns 3119 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3120 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3121 to be done without serializing on the IS list, so, most likely, it is best 3122 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3123 */ 3124 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3125 /* Now we have to 3126 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3127 were sorted on each rank, concatenated they might no longer be sorted; 3128 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3129 indices in the nondecreasing order to the original index positions. 3130 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3131 */ 3132 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3133 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3134 ++ii; 3135 } 3136 } 3137 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3138 for (i = 0, ii = 0; i < ismax; ++i) { 3139 PetscInt j,issize; 3140 const PetscInt *indices; 3141 3142 /* 3143 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 
3144 */ 3145 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3146 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3147 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3148 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3149 for (j = 1; j < issize; ++j) { 3150 if (indices[j] == indices[j-1]) { 3151 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3152 } 3153 } 3154 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3155 3156 3157 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3158 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3159 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3160 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3161 for (j = 1; j < issize; ++j) { 3162 if (indices[j-1] == indices[j]) { 3163 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3164 } 3165 } 3166 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3167 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3168 if (isize > 1) { 3169 cisrow[ii] = isrow[i]; 3170 ++ii; 3171 } 3172 } 3173 /* 3174 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3175 array of sequential matrices underlying the resulting parallel matrices. 3176 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3177 contain duplicates. 3178 3179 There are as many diag matrices as there are original index sets. There are only as many parallel 3180 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
3181 3182 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3183 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3184 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3185 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3186 with A[i] and B[ii] extracted from the corresponding MPI submat. 3187 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3188 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3189 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3190 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3191 values into A[i] and B[ii] sitting inside the corresponding submat. 3192 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3193 A[i], B[ii], AA[i] or BB[ii] matrices. 3194 */ 3195 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3196 if (scall == MAT_INITIAL_MATRIX) { 3197 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3198 } 3199 3200 /* Now obtain the sequential A and B submatrices separately. 
*/ 3201 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3202 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3203 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3204 3205 /* 3206 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3207 matrices A & B have been extracted directly into the parallel matrices containing them, or 3208 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3209 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3210 to have the same sparsity pattern. 3211 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3212 must be constructed for C. This is done by setseqmat(s). 3213 */ 3214 for (i = 0, ii = 0; i < ismax; ++i) { 3215 /* 3216 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3217 That way we can avoid sorting and computing permutations when reusing. 3218 To this end: 3219 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3220 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3221 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3222 */ 3223 MatStructure pattern; 3224 pattern = DIFFERENT_NONZERO_PATTERN; 3225 3226 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3227 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3228 if (isize > 1) { 3229 if (scall == MAT_INITIAL_MATRIX) { 3230 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3231 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3232 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3233 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3234 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3235 } 3236 /* 3237 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3238 */ 3239 { 3240 Mat AA,BB; 3241 AA = A[i]; 3242 BB = B[ii]; 3243 if (AA || BB) { 3244 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3245 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3246 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3247 } 3248 3249 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3250 } 3251 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3252 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3253 ++ii; 3254 } else { /* if (isize == 1) */ 3255 if (scall == MAT_REUSE_MATRIX) { 3256 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3257 } 3258 if (isrow_p[i] || iscol_p[i]) { 3259 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3260 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3261 /* Otherwise A is extracted straight into (*submats)[i]. */ 3262 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3263 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3264 } else (*submat)[i] = A[i]; 3265 } 3266 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3267 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3268 } 3269 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3270 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3271 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3272 ierr = PetscFree(A);CHKERRQ(ierr); 3273 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3274 PetscFunctionReturn(0); 3275 } 3276 3277 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3278 { 3279 PetscErrorCode ierr; 3280 3281 PetscFunctionBegin; 3282 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3283 PetscFunctionReturn(0); 3284 } 3285