1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 
PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = 0,*sbdata = 0; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++) { 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++) { 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank) { 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 
ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows) { 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++) { 115 if (tosizes_temp[i]) { 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 toffsets[0] = 0; 123 for (i=0; i<nto; i++) { 124 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 125 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 126 } 127 /* send information to other processors */ 128 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 129 nrecvrows = 0; 130 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 131 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 132 nrecvrows = 0; 133 for (i=0; i<nfrom; i++) { 134 for (j=0; j<fromsizes[2*i]; j++) { 135 remote[nrecvrows].rank = fromranks[i]; 136 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 137 } 138 } 139 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 140 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 141 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 142 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 143 ierr 
= PetscSFSetFromOptions(sf);CHKERRQ(ierr); 144 /* message pair <no of is, row> */ 145 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 146 for (i=0; i<nrrows; i++) { 147 owner = rrow_ranks[i]; /* processor */ 148 j = tosizes_temp[owner]; /* index */ 149 todata[toffsets[j]++] = rrow_isids[i]; 150 todata[toffsets[j]++] = remoterows[i]; 151 } 152 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 153 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 154 ierr = PetscFree(toffsets);CHKERRQ(ierr); 155 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 157 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 158 /* send rows belonging to the remote so that then we could get the overlapping data back */ 159 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 160 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 161 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 162 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 163 ierr = PetscFree(fromranks);CHKERRQ(ierr); 164 nrecvrows = 0; 165 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 166 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 167 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 168 nrecvrows = 0; 169 for (i=0; i<nto; i++) { 170 for (j=0; j<tosizes[2*i]; j++) { 171 remote[nrecvrows].rank = toranks[i]; 172 remote[nrecvrows++].index = tosizes[2*i+1]+j; 173 } 174 } 175 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 176 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 177 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 178 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 179 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 180 /* overlap communication 
and computation */ 181 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 182 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 183 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 184 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 185 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 186 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 187 ierr = PetscFree(toranks);CHKERRQ(ierr); 188 ierr = PetscFree(tosizes);CHKERRQ(ierr); 189 ierr = PetscFree(todata);CHKERRQ(ierr); 190 PetscFunctionReturn(0); 191 } 192 193 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 194 { 195 PetscInt *isz,isz_i,i,j,is_id, data_size; 196 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 197 const PetscInt *indices_i_temp; 198 MPI_Comm *iscomms; 199 PetscErrorCode ierr; 200 201 PetscFunctionBegin; 202 max_lsize = 0; 203 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 204 for (i=0; i<nidx; i++){ 205 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 206 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 207 isz[i] = lsize; 208 } 209 ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr); 210 for (i=0; i<nidx; i++){ 211 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 212 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 213 ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr); 214 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 215 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 216 } 217 /* retrieve information to get row id and its overlap */ 218 for (i=0; i<nrecvs; ){ 219 is_id = recvdata[i++]; 220 data_size = recvdata[i++]; 221 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 222 isz_i = isz[is_id]; 223 for (j=0; j< data_size; j++){ 224 col = recvdata[i++]; 225 indices_i[isz_i++] = col; 226 } 227 isz[is_id] = isz_i; 228 } 229 /* remove duplicate entities */ 230 for (i=0; i<nidx; i++){ 231 indices_i = indices_temp+(max_lsize+nrecvs)*i; 232 isz_i = isz[i]; 233 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 234 ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 235 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 236 } 237 ierr = PetscFree(isz);CHKERRQ(ierr); 238 ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 243 { 244 PetscLayout rmap,cmap; 245 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 246 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 247 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 248 const PetscInt *gcols,*ai,*aj,*bi,*bj; 249 Mat amat,bmat; 250 PetscMPIInt rank; 251 PetscBool done; 252 MPI_Comm comm; 253 
PetscErrorCode ierr; 254 255 PetscFunctionBegin; 256 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 257 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 258 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 259 /* Even if the mat is symmetric, we still assume it is not symmetric */ 260 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 261 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 262 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 263 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 264 /* total number of nonzero values is used to estimate the memory usage in the next step */ 265 tnz = ai[an]+bi[bn]; 266 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 267 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 268 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 269 /* to find the longest message */ 270 max_fszs = 0; 271 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 272 /* better way to estimate number of nonzero in the mat??? 
*/ 273 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 274 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 275 rows_pos = 0; 276 totalrows = 0; 277 for (i=0; i<nfrom; i++){ 278 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 279 /* group data together */ 280 for (j=0; j<fromsizes[2*i]; j+=2){ 281 is_id = fromrows[rows_pos++];/* no of is */ 282 rows_i = rows_data[is_id]; 283 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 284 } 285 /* estimate a space to avoid multiple allocations */ 286 for (j=0; j<nidx; j++){ 287 indvc_ij = 0; 288 rows_i = rows_data[j]; 289 for (l=0; l<rows_pos_i[j]; l++){ 290 row = rows_i[l]-rstart; 291 start = ai[row]; 292 end = ai[row+1]; 293 for (k=start; k<end; k++){ /* Amat */ 294 col = aj[k] + cstart; 295 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 296 } 297 start = bi[row]; 298 end = bi[row+1]; 299 for (k=start; k<end; k++) { /* Bmat */ 300 col = gcols[bj[k]]; 301 indices_tmp[indvc_ij++] = col; 302 } 303 } 304 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 305 indv_counts[i*nidx+j] = indvc_ij; 306 totalrows += indvc_ij; 307 } 308 } 309 /* message triple <no of is, number of rows, rows> */ 310 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 311 totalrows = 0; 312 rows_pos = 0; 313 /* use this code again */ 314 for (i=0;i<nfrom;i++){ 315 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 316 for (j=0; j<fromsizes[2*i]; j+=2){ 317 is_id = fromrows[rows_pos++]; 318 rows_i = rows_data[is_id]; 319 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 320 } 321 /* add data */ 322 for (j=0; j<nidx; j++){ 323 if (!indv_counts[i*nidx+j]) continue; 324 indvc_ij = 0; 325 sbdata[totalrows++] = j; 326 sbdata[totalrows++] = indv_counts[i*nidx+j]; 327 sbsizes[2*i] += 2; 328 rows_i = rows_data[j]; 329 for (l=0; l<rows_pos_i[j]; l++){ 330 
row = rows_i[l]-rstart; 331 start = ai[row]; 332 end = ai[row+1]; 333 for (k=start; k<end; k++){ /* Amat */ 334 col = aj[k] + cstart; 335 indices_tmp[indvc_ij++] = col; 336 } 337 start = bi[row]; 338 end = bi[row+1]; 339 for (k=start; k<end; k++) { /* Bmat */ 340 col = gcols[bj[k]]; 341 indices_tmp[indvc_ij++] = col; 342 } 343 } 344 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 345 sbsizes[2*i] += indvc_ij; 346 ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr); 347 totalrows += indvc_ij; 348 } 349 } 350 ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 351 offsets[0] = 0; 352 for (i=0; i<nfrom; i++){ 353 offsets[i+1] = offsets[i] + sbsizes[2*i]; 354 sbsizes[2*i+1] = offsets[i]; 355 } 356 ierr = PetscFree(offsets);CHKERRQ(ierr); 357 if (sbrowsizes) *sbrowsizes = sbsizes; 358 if (sbrows) *sbrows = sbdata; 359 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 360 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 361 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 366 { 367 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 368 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 369 PetscInt lsize,lsize_tmp; 370 PetscMPIInt rank,owner; 371 Mat amat,bmat; 372 PetscBool done; 373 PetscLayout cmap,rmap; 374 MPI_Comm comm; 375 PetscErrorCode ierr; 376 377 PetscFunctionBegin; 378 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 379 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 380 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 381 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 382 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 383 ierr = 
MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 384 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 385 /* is it a safe way to compute number of nonzero values ? */ 386 tnz = ai[an]+bi[bn]; 387 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 388 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 389 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 390 /* it is a better way to estimate memory than the old implementation 391 * where global size of matrix is used 392 * */ 393 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 394 for (i=0; i<nidx; i++) { 395 MPI_Comm iscomm; 396 397 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 398 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 399 lsize_tmp = 0; 400 for (j=0; j<lsize; j++) { 401 owner = -1; 402 row = indices[j]; 403 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 404 if (owner != rank) continue; 405 /* local number */ 406 row -= rstart; 407 start = ai[row]; 408 end = ai[row+1]; 409 for (k=start; k<end; k++) { /* Amat */ 410 col = aj[k] + cstart; 411 indices_temp[lsize_tmp++] = col; 412 } 413 start = bi[row]; 414 end = bi[row+1]; 415 for (k=start; k<end; k++) { /* Bmat */ 416 col = gcols[bj[k]]; 417 indices_temp[lsize_tmp++] = col; 418 } 419 } 420 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 421 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr); 422 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 423 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 424 ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 425 ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr); 426 } 427 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 428 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 429 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 430 
PetscFunctionReturn(0); 431 } 432 433 434 /* 435 Sample message format: 436 If a processor A wants processor B to process some elements corresponding 437 to index sets is[1],is[5] 438 mesg [0] = 2 (no of index sets in the mesg) 439 ----------- 440 mesg [1] = 1 => is[1] 441 mesg [2] = sizeof(is[1]); 442 ----------- 443 mesg [3] = 5 => is[5] 444 mesg [4] = sizeof(is[5]); 445 ----------- 446 mesg [5] 447 mesg [n] datas[1] 448 ----------- 449 mesg[n+1] 450 mesg[m] data(is[5]) 451 ----------- 452 453 Notes: 454 nrqs - no of requests sent (or to be sent out) 455 nrqr - no of requests recieved (which have to be or which have been processed 456 */ 457 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 458 { 459 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 460 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 461 const PetscInt **idx,*idx_i; 462 PetscInt *n,**data,len; 463 #if defined(PETSC_USE_CTABLE) 464 PetscTable *table_data,table_data_i; 465 PetscInt *tdata,tcount,tcount_max; 466 #else 467 PetscInt *data_i,*d_p; 468 #endif 469 PetscErrorCode ierr; 470 PetscMPIInt size,rank,tag1,tag2,proc = 0; 471 PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 472 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 473 PetscBT *table; 474 MPI_Comm comm; 475 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 476 MPI_Status *s_status,*recv_status; 477 MPI_Comm *iscomms; 478 char *t_p; 479 480 PetscFunctionBegin; 481 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 482 size = c->size; 483 rank = c->rank; 484 M = C->rmap->N; 485 486 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 487 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 488 489 ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 490 491 for (i=0; i<imax; i++) { 492 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 493 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 494 } 495 496 /* evaluate communication - mesg 
to who,length of mesg, and buffer space 497 required. Based on this, buffers are allocated, and data copied into them */ 498 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 499 for (i=0; i<imax; i++) { 500 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/ 501 idx_i = idx[i]; 502 len = n[i]; 503 for (j=0; j<len; j++) { 504 row = idx_i[j]; 505 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 506 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 507 w4[proc]++; 508 } 509 for (j=0; j<size; j++) { 510 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 511 } 512 } 513 514 nrqs = 0; /* no of outgoing messages */ 515 msz = 0; /* total mesg length (for all proc */ 516 w1[rank] = 0; /* no mesg sent to intself */ 517 w3[rank] = 0; 518 for (i=0; i<size; i++) { 519 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 520 } 521 /* pa - is list of processors to communicate with */ 522 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 523 for (i=0,j=0; i<size; i++) { 524 if (w1[i]) {pa[j] = i; j++;} 525 } 526 527 /* Each message would have a header = 1 + 2*(no of IS) + data */ 528 for (i=0; i<nrqs; i++) { 529 j = pa[i]; 530 w1[j] += w2[j] + 2*w3[j]; 531 msz += w1[j]; 532 } 533 534 /* Determine the number of messages to expect, their lengths, from from-ids */ 535 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 536 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 537 538 /* Now post the Irecvs corresponding to these messages */ 539 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 540 541 /* Allocate Memory for outgoing messages */ 542 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 543 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr); 544 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 545 546 { 547 PetscInt *iptr = tmp,ict = 0; 548 for 
(i=0; i<nrqs; i++) { 549 j = pa[i]; 550 iptr += ict; 551 outdat[j] = iptr; 552 ict = w1[j]; 553 } 554 } 555 556 /* Form the outgoing messages */ 557 /* plug in the headers */ 558 for (i=0; i<nrqs; i++) { 559 j = pa[i]; 560 outdat[j][0] = 0; 561 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr); 562 ptr[j] = outdat[j] + 2*w3[j] + 1; 563 } 564 565 /* Memory for doing local proc's work */ 566 { 567 PetscInt M_BPB_imax = 0; 568 #if defined(PETSC_USE_CTABLE) 569 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 570 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 571 for (i=0; i<imax; i++) { 572 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 573 } 574 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 575 for (i=0; i<imax; i++) { 576 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 577 } 578 #else 579 PetscInt Mimax = 0; 580 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 581 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 582 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 583 for (i=0; i<imax; i++) { 584 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 585 data[i] = d_p + M*i; 586 } 587 #endif 588 } 589 590 /* Parse the IS and update local tables and the outgoing buf with the data */ 591 { 592 PetscInt n_i,isz_i,*outdat_j,ctr_j; 593 PetscBT table_i; 594 595 for (i=0; i<imax; i++) { 596 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 597 n_i = n[i]; 598 table_i = table[i]; 599 idx_i = idx[i]; 600 #if defined(PETSC_USE_CTABLE) 601 table_data_i = table_data[i]; 602 #else 603 data_i = data[i]; 604 #endif 605 isz_i = isz[i]; 606 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 607 row = idx_i[j]; 608 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 609 if (proc != rank) { /* copy to the outgoing buffer */ 610 ctr[proc]++; 611 *ptr[proc] = row; 612 ptr[proc]++; 
613 } else if (!PetscBTLookupSet(table_i,row)) { 614 #if defined(PETSC_USE_CTABLE) 615 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 616 #else 617 data_i[isz_i] = row; /* Update the local table */ 618 #endif 619 isz_i++; 620 } 621 } 622 /* Update the headers for the current IS */ 623 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 624 if ((ctr_j = ctr[j])) { 625 outdat_j = outdat[j]; 626 k = ++outdat_j[0]; 627 outdat_j[2*k] = ctr_j; 628 outdat_j[2*k-1] = i; 629 } 630 } 631 isz[i] = isz_i; 632 } 633 } 634 635 /* Now post the sends */ 636 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 637 for (i=0; i<nrqs; ++i) { 638 j = pa[i]; 639 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 640 } 641 642 /* No longer need the original indices */ 643 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 644 for (i=0; i<imax; ++i) { 645 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 646 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 647 } 648 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 649 650 for (i=0; i<imax; ++i) { 651 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 652 } 653 654 /* Do Local work */ 655 #if defined(PETSC_USE_CTABLE) 656 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 657 #else 658 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 659 #endif 660 661 /* Receive messages */ 662 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 663 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 664 665 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 666 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 667 668 /* Phase 1 sends are complete - deallocate buffers */ 669 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 670 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 671 672 ierr = 
PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 673 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 674 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 675 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 676 ierr = PetscFree(rbuf);CHKERRQ(ierr); 677 678 679 /* Send the data back */ 680 /* Do a global reduction to know the buffer space req for incoming messages */ 681 { 682 PetscMPIInt *rw1; 683 684 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 685 686 for (i=0; i<nrqr; ++i) { 687 proc = recv_status[i].MPI_SOURCE; 688 689 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 690 rw1[proc] = isz1[i]; 691 } 692 ierr = PetscFree(onodes1);CHKERRQ(ierr); 693 ierr = PetscFree(olengths1);CHKERRQ(ierr); 694 695 /* Determine the number of messages to expect, their lengths, from from-ids */ 696 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 697 ierr = PetscFree(rw1);CHKERRQ(ierr); 698 } 699 /* Now post the Irecvs corresponding to these messages */ 700 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 701 702 /* Now post the sends */ 703 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 704 for (i=0; i<nrqr; ++i) { 705 j = recv_status[i].MPI_SOURCE; 706 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 707 } 708 709 /* receive work done on other processors */ 710 { 711 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 712 PetscMPIInt idex; 713 PetscBT table_i; 714 MPI_Status *status2; 715 716 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 717 for (i=0; i<nrqs; ++i) { 718 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 719 /* Process the message */ 720 rbuf2_i = rbuf2[idex]; 721 ct1 = 2*rbuf2_i[0]+1; 722 jmax = rbuf2[idex][0]; 723 for (j=1; j<=jmax; j++) { 724 max = rbuf2_i[2*j]; 725 is_no = rbuf2_i[2*j-1]; 726 isz_i = isz[is_no]; 727 table_i = table[is_no]; 728 #if 
defined(PETSC_USE_CTABLE) 729 table_data_i = table_data[is_no]; 730 #else 731 data_i = data[is_no]; 732 #endif 733 for (k=0; k<max; k++,ct1++) { 734 row = rbuf2_i[ct1]; 735 if (!PetscBTLookupSet(table_i,row)) { 736 #if defined(PETSC_USE_CTABLE) 737 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 738 #else 739 data_i[isz_i] = row; 740 #endif 741 isz_i++; 742 } 743 } 744 isz[is_no] = isz_i; 745 } 746 } 747 748 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 749 ierr = PetscFree(status2);CHKERRQ(ierr); 750 } 751 752 #if defined(PETSC_USE_CTABLE) 753 tcount_max = 0; 754 for (i=0; i<imax; ++i) { 755 table_data_i = table_data[i]; 756 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 757 if (tcount_max < tcount) tcount_max = tcount; 758 } 759 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 760 #endif 761 762 for (i=0; i<imax; ++i) { 763 #if defined(PETSC_USE_CTABLE) 764 PetscTablePosition tpos; 765 table_data_i = table_data[i]; 766 767 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 768 while (tpos) { 769 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 770 tdata[--j] = --k; 771 } 772 ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 773 #else 774 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 775 #endif 776 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 777 } 778 779 ierr = PetscFree(iscomms);CHKERRQ(ierr); 780 ierr = PetscFree(onodes2);CHKERRQ(ierr); 781 ierr = PetscFree(olengths2);CHKERRQ(ierr); 782 783 ierr = PetscFree(pa);CHKERRQ(ierr); 784 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 785 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 786 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 787 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 788 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 789 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 790 ierr = PetscFree(s_status);CHKERRQ(ierr); 791 ierr = 
PetscFree(recv_status);CHKERRQ(ierr); 792 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 793 ierr = PetscFree(xdata);CHKERRQ(ierr); 794 ierr = PetscFree(isz1);CHKERRQ(ierr); 795 #if defined(PETSC_USE_CTABLE) 796 for (i=0; i<imax; i++) { 797 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 798 } 799 ierr = PetscFree(table_data);CHKERRQ(ierr); 800 ierr = PetscFree(tdata);CHKERRQ(ierr); 801 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 802 #else 803 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 804 #endif 805 PetscFunctionReturn(0); 806 } 807 808 /* 809 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 810 the work on the local processor. 811 812 Inputs: 813 C - MAT_MPIAIJ; 814 imax - total no of index sets processed at a time; 815 table - an array of char - size = m bits. 816 817 Output: 818 isz - array containing the count of the solution elements corresponding 819 to each index set; 820 data or table_data - pointer to the solutions 821 */ 822 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 823 { 824 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 825 Mat A = c->A,B = c->B; 826 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 827 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 828 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 829 PetscBT table_i; 830 #if defined(PETSC_USE_CTABLE) 831 PetscTable table_data_i; 832 PetscErrorCode ierr; 833 PetscTablePosition tpos; 834 PetscInt tcount,*tdata; 835 #else 836 PetscInt *data_i; 837 #endif 838 839 PetscFunctionBegin; 840 rstart = C->rmap->rstart; 841 cstart = C->cmap->rstart; 842 ai = a->i; 843 aj = a->j; 844 bi = b->i; 845 bj = b->j; 846 garray = c->garray; 847 848 for (i=0; i<imax; i++) { 849 #if defined(PETSC_USE_CTABLE) 850 /* copy existing entries of table_data_i into tdata[] */ 851 table_data_i = table_data[i]; 852 ierr = 
PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 853 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 854 855 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 856 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 857 while (tpos) { 858 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 859 tdata[--j] = --row; 860 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 861 } 862 #else 863 data_i = data[i]; 864 #endif 865 table_i = table[i]; 866 isz_i = isz[i]; 867 max = isz[i]; 868 869 for (j=0; j<max; j++) { 870 #if defined(PETSC_USE_CTABLE) 871 row = tdata[j] - rstart; 872 #else 873 row = data_i[j] - rstart; 874 #endif 875 start = ai[row]; 876 end = ai[row+1]; 877 for (k=start; k<end; k++) { /* Amat */ 878 val = aj[k] + cstart; 879 if (!PetscBTLookupSet(table_i,val)) { 880 #if defined(PETSC_USE_CTABLE) 881 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 882 #else 883 data_i[isz_i] = val; 884 #endif 885 isz_i++; 886 } 887 } 888 start = bi[row]; 889 end = bi[row+1]; 890 for (k=start; k<end; k++) { /* Bmat */ 891 val = garray[bj[k]]; 892 if (!PetscBTLookupSet(table_i,val)) { 893 #if defined(PETSC_USE_CTABLE) 894 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 895 #else 896 data_i[isz_i] = val; 897 #endif 898 isz_i++; 899 } 900 } 901 } 902 isz[i] = isz_i; 903 904 #if defined(PETSC_USE_CTABLE) 905 ierr = PetscFree(tdata);CHKERRQ(ierr); 906 #endif 907 } 908 PetscFunctionReturn(0); 909 } 910 911 /* 912 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 913 and return the output 914 915 Input: 916 C - the matrix 917 nrqr - no of messages being processed. 
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of index sets over all messages; total_sz = total
     number of row indices received (message layout: [nsets, (id,len) pairs, rows...]) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic initial size for the single shared output buffer xdata[0];
     it is grown by 1.5x whenever ct3 (total entries written) would exceed it */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1; /* skip the header: nsets plus an (id,len) pair per set */
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* shared buffer full: grow by 1.5x and re-seat every message pointer */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each copied row by its column indices in A (diagonal block)
         and B (off-diagonal block, mapped to global indices via garray) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message starts right after this one in the shared buffer */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the
number of nonzeros per row
    */
    /* lens[] is gathered below so every rank knows the nonzero count of every global row */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    /* Bin is a length-2 array with the sequential matrix in the first slot */
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]]; /* write directly into this rank's slice of B's column array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: the structure was built on a previous call; only values change */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]]; /* write directly into this rank's slice of B's value array */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* not allocated above when scall == MAT_REUSE_MATRIX */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* end of (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1264 PetscMPIInt *onodes1,*olengths1,idex,end; 1265 Mat_SubSppt *smatis1; 1266 PetscBool isrowsorted,iscolsorted; 1267 1268 PetscFunctionBegin; 1269 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1270 1271 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1272 size = c->size; 1273 rank = c->rank; 1274 1275 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1276 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1277 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1278 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1279 if (allcolumns) { 1280 icol = NULL; 1281 ncol = C->cmap->N; 1282 } else { 1283 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1284 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1285 } 1286 1287 if (scall == MAT_INITIAL_MATRIX) { 1288 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1289 1290 /* Get some new tags to keep the communication clean */ 1291 tag1 = ((PetscObject)C)->tag; 1292 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1293 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1294 1295 /* evaluate communication - mesg to who, length of mesg, and buffer space 1296 required. 
Based on this, buffers are allocated, and data copied into them */ 1297 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1298 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1299 1300 /* w1[proc] = num of rows owned by proc -- to be requested */ 1301 proc = 0; 1302 nrqs = 0; /* num of outgoing messages */ 1303 for (j=0; j<nrow; j++) { 1304 row = irow[j]; 1305 if (!isrowsorted) proc = 0; 1306 while (row >= C->rmap->range[proc+1]) proc++; 1307 w1[proc]++; 1308 row2proc[j] = proc; /* map row index to proc */ 1309 1310 if (proc != rank && !w2[proc]) { 1311 w2[proc] = 1; nrqs++; 1312 } 1313 } 1314 w1[rank] = 0; /* rows owned by self will not be requested */ 1315 1316 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1317 for (proc=0,j=0; proc<size; proc++) { 1318 if (w1[proc]) { pa[j++] = proc;} 1319 } 1320 1321 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1322 msz = 0; /* total mesg length (for all procs) */ 1323 for (i=0; i<nrqs; i++) { 1324 proc = pa[i]; 1325 w1[proc] += 3; 1326 msz += w1[proc]; 1327 } 1328 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1329 1330 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1331 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1332 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1333 1334 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1335 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1336 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1337 1338 /* Now post the Irecvs corresponding to these messages */ 1339 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1340 1341 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1342 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1343 1344 /* Allocate Memory for outgoing messages 
*/ 1345 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1346 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1347 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1348 1349 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1350 iptr = tmp; 1351 for (i=0; i<nrqs; i++) { 1352 proc = pa[i]; 1353 sbuf1[proc] = iptr; 1354 iptr += w1[proc]; 1355 } 1356 1357 /* Form the outgoing messages */ 1358 /* Initialize the header space */ 1359 for (i=0; i<nrqs; i++) { 1360 proc = pa[i]; 1361 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1362 ptr[proc] = sbuf1[proc] + 3; 1363 } 1364 1365 /* Parse the isrow and copy data into outbuf */ 1366 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1367 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1368 proc = row2proc[j]; 1369 if (proc != rank) { /* copy to the outgoing buf*/ 1370 *ptr[proc] = irow[j]; 1371 ctr[proc]++; ptr[proc]++; 1372 } 1373 } 1374 1375 /* Update the headers for the current IS */ 1376 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1377 if ((ctr_j = ctr[j])) { 1378 sbuf1_j = sbuf1[j]; 1379 k = ++sbuf1_j[0]; 1380 sbuf1_j[2*k] = ctr_j; 1381 sbuf1_j[2*k-1] = 0; 1382 } 1383 } 1384 1385 /* Now post the sends */ 1386 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1387 for (i=0; i<nrqs; ++i) { 1388 proc = pa[i]; 1389 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1390 } 1391 1392 /* Post Receives to capture the buffer size */ 1393 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1394 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1395 1396 rbuf2[0] = tmp + msz; 1397 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1398 1399 for (i=0; i<nrqs; ++i) { 1400 proc = pa[i]; 1401 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1402 } 1403 1404 ierr = 
PetscFree2(w1,w2);CHKERRQ(ierr); 1405 1406 /* Send to other procs the buf size they should allocate */ 1407 /* Receive messages*/ 1408 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1409 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1410 1411 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1412 for (i=0; i<nrqr; ++i) { 1413 req_size[i] = 0; 1414 rbuf1_i = rbuf1[i]; 1415 start = 2*rbuf1_i[0] + 1; 1416 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1417 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1418 sbuf2_i = sbuf2[i]; 1419 for (j=start; j<end; j++) { 1420 k = rbuf1_i[j] - rstart; 1421 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1422 sbuf2_i[j] = ncols; 1423 req_size[i] += ncols; 1424 } 1425 req_source1[i] = r_status1[i].MPI_SOURCE; 1426 1427 /* form the header */ 1428 sbuf2_i[0] = req_size[i]; 1429 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1430 1431 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1432 } 1433 1434 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1435 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1436 1437 /* rbuf2 is received, Post recv column indices a->j */ 1438 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1439 1440 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1441 for (i=0; i<nrqs; ++i) { 1442 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1443 req_source2[i] = r_status2[i].MPI_SOURCE; 1444 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1445 } 1446 1447 /* Wait on sends1 and sends2 */ 1448 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1449 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1450 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1451 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1452 1453 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1454 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1455 1456 /* Now allocate sending buffers for a->j, and send them off */ 1457 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1458 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1459 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1460 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1461 1462 for (i=0; i<nrqr; i++) { /* for each requested message */ 1463 rbuf1_i = rbuf1[i]; 1464 sbuf_aj_i = sbuf_aj[i]; 1465 ct1 = 2*rbuf1_i[0] + 1; 1466 ct2 = 0; 1467 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1468 1469 kmax = rbuf1[i][2]; 1470 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1471 row = rbuf1_i[ct1] - rstart; 1472 nzA = ai[row+1] - ai[row]; 1473 nzB = bi[row+1] - bi[row]; 1474 ncols = nzA + nzB; 1475 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1476 1477 /* load the column indices for this row into cols*/ 1478 cols = sbuf_aj_i + ct2; 1479 1480 lwrite = 0; 1481 for (l=0; l<nzB; l++) { 1482 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1483 } 1484 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1485 for (l=0; l<nzB; l++) { 1486 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1487 } 1488 1489 ct2 += ncols; 1490 } 1491 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1492 } 1493 1494 /* create column map (cmap): global col of C -> local col of submat */ 1495 #if defined(PETSC_USE_CTABLE) 1496 if (!allcolumns) { 1497 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1498 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1499 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1500 if (icol[j] >= cstart && icol[j] <cend) { 1501 cmap_loc[icol[j] - cstart] = j+1; 1502 } else { /* use PetscTable for non-local col indices */ 1503 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1504 } 1505 
} 1506 } else { 1507 cmap = NULL; 1508 cmap_loc = NULL; 1509 } 1510 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1511 #else 1512 if (!allcolumns) { 1513 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1514 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1515 } else { 1516 cmap = NULL; 1517 } 1518 #endif 1519 1520 /* Create lens for MatSeqAIJSetPreallocation() */ 1521 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1522 1523 /* Compute lens from local part of C */ 1524 for (j=0; j<nrow; j++) { 1525 row = irow[j]; 1526 proc = row2proc[j]; 1527 if (proc == rank) { 1528 /* diagonal part A = c->A */ 1529 ncols = ai[row-rstart+1] - ai[row-rstart]; 1530 cols = aj + ai[row-rstart]; 1531 if (!allcolumns) { 1532 for (k=0; k<ncols; k++) { 1533 #if defined(PETSC_USE_CTABLE) 1534 tcol = cmap_loc[cols[k]]; 1535 #else 1536 tcol = cmap[cols[k]+cstart]; 1537 #endif 1538 if (tcol) lens[j]++; 1539 } 1540 } else { /* allcolumns */ 1541 lens[j] = ncols; 1542 } 1543 1544 /* off-diagonal part B = c->B */ 1545 ncols = bi[row-rstart+1] - bi[row-rstart]; 1546 cols = bj + bi[row-rstart]; 1547 if (!allcolumns) { 1548 for (k=0; k<ncols; k++) { 1549 #if defined(PETSC_USE_CTABLE) 1550 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1551 #else 1552 tcol = cmap[bmap[cols[k]]]; 1553 #endif 1554 if (tcol) lens[j]++; 1555 } 1556 } else { /* allcolumns */ 1557 lens[j] += ncols; 1558 } 1559 } 1560 } 1561 1562 /* Create row map (rmap): global row of C -> local row of submat */ 1563 #if defined(PETSC_USE_CTABLE) 1564 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1565 for (j=0; j<nrow; j++) { 1566 row = irow[j]; 1567 proc = row2proc[j]; 1568 if (proc == rank) { /* a local row */ 1569 rmap_loc[row - rstart] = j; 1570 } else { 1571 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1572 } 1573 } 1574 #else 1575 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1576 for (j=0; j<nrow; j++) { 1577 rmap[irow[j]] = j; 1578 } 1579 #endif 1580 
1581 /* Update lens from offproc data */ 1582 /* recv a->j is done */ 1583 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1584 for (i=0; i<nrqs; i++) { 1585 proc = pa[i]; 1586 sbuf1_i = sbuf1[proc]; 1587 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1588 ct1 = 2 + 1; 1589 ct2 = 0; 1590 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1591 rbuf3_i = rbuf3[i]; /* received C->j */ 1592 1593 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1594 max1 = sbuf1_i[2]; 1595 for (k=0; k<max1; k++,ct1++) { 1596 #if defined(PETSC_USE_CTABLE) 1597 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1598 row--; 1599 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1600 #else 1601 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1602 #endif 1603 /* Now, store row index of submat in sbuf1_i[ct1] */ 1604 sbuf1_i[ct1] = row; 1605 1606 nnz = rbuf2_i[ct1]; 1607 if (!allcolumns) { 1608 for (l=0; l<nnz; l++,ct2++) { 1609 #if defined(PETSC_USE_CTABLE) 1610 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1611 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1612 } else { 1613 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1614 } 1615 #else 1616 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1617 #endif 1618 if (tcol) lens[row]++; 1619 } 1620 } else { /* allcolumns */ 1621 lens[row] += nnz; 1622 } 1623 } 1624 } 1625 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1626 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1627 1628 /* Create the submatrices */ 1629 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1630 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1631 1632 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1633 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1634 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1635 ierr = 
MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1636 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1637 1638 /* create struct Mat_SubSppt and attached it to submat */ 1639 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1640 subc = (Mat_SeqAIJ*)submat->data; 1641 subc->submatis1 = smatis1; 1642 1643 smatis1->id = 0; 1644 smatis1->nrqs = nrqs; 1645 smatis1->nrqr = nrqr; 1646 smatis1->rbuf1 = rbuf1; 1647 smatis1->rbuf2 = rbuf2; 1648 smatis1->rbuf3 = rbuf3; 1649 smatis1->sbuf2 = sbuf2; 1650 smatis1->req_source2 = req_source2; 1651 1652 smatis1->sbuf1 = sbuf1; 1653 smatis1->ptr = ptr; 1654 smatis1->tmp = tmp; 1655 smatis1->ctr = ctr; 1656 1657 smatis1->pa = pa; 1658 smatis1->req_size = req_size; 1659 smatis1->req_source1 = req_source1; 1660 1661 smatis1->allcolumns = allcolumns; 1662 smatis1->singleis = PETSC_TRUE; 1663 smatis1->row2proc = row2proc; 1664 smatis1->rmap = rmap; 1665 smatis1->cmap = cmap; 1666 #if defined(PETSC_USE_CTABLE) 1667 smatis1->rmap_loc = rmap_loc; 1668 smatis1->cmap_loc = cmap_loc; 1669 #endif 1670 1671 smatis1->destroy = submat->ops->destroy; 1672 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1673 submat->factortype = C->factortype; 1674 1675 /* compute rmax */ 1676 rmax = 0; 1677 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1678 1679 } else { /* scall == MAT_REUSE_MATRIX */ 1680 submat = submats[0]; 1681 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1682 1683 subc = (Mat_SeqAIJ*)submat->data; 1684 rmax = subc->rmax; 1685 smatis1 = subc->submatis1; 1686 nrqs = smatis1->nrqs; 1687 nrqr = smatis1->nrqr; 1688 rbuf1 = smatis1->rbuf1; 1689 rbuf2 = smatis1->rbuf2; 1690 rbuf3 = smatis1->rbuf3; 1691 req_source2 = smatis1->req_source2; 1692 1693 sbuf1 = smatis1->sbuf1; 1694 sbuf2 = smatis1->sbuf2; 1695 ptr = smatis1->ptr; 1696 tmp = smatis1->tmp; 1697 ctr = smatis1->ctr; 1698 1699 pa = smatis1->pa; 1700 req_size = smatis1->req_size; 1701 req_source1 = smatis1->req_source1; 1702 1703 allcolumns = smatis1->allcolumns; 1704 row2proc = smatis1->row2proc; 1705 rmap = smatis1->rmap; 1706 cmap = smatis1->cmap; 1707 #if defined(PETSC_USE_CTABLE) 1708 rmap_loc = smatis1->rmap_loc; 1709 cmap_loc = smatis1->cmap_loc; 1710 #endif 1711 } 1712 1713 /* Post recv matrix values */ 1714 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1715 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1716 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1717 for (i=0; i<nrqs; ++i) { 1718 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1719 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1720 } 1721 1722 /* Allocate sending buffers for a->a, and send them off */ 1723 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1724 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1725 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1726 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1727 1728 for (i=0; i<nrqr; i++) { 1729 rbuf1_i = rbuf1[i]; 1730 sbuf_aa_i = sbuf_aa[i]; 1731 ct1 = 2*rbuf1_i[0]+1; 1732 ct2 = 0; 1733 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1734 1735 kmax = rbuf1_i[2]; 1736 for (k=0; k<kmax; k++,ct1++) { 1737 row = rbuf1_i[ct1] - rstart; 1738 nzA = ai[row+1] - ai[row]; 1739 nzB = bi[row+1] - 
bi[row]; 1740 ncols = nzA + nzB; 1741 cworkB = bj + bi[row]; 1742 vworkA = a_a + ai[row]; 1743 vworkB = b_a + bi[row]; 1744 1745 /* load the column values for this row into vals*/ 1746 vals = sbuf_aa_i + ct2; 1747 1748 lwrite = 0; 1749 for (l=0; l<nzB; l++) { 1750 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1751 } 1752 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1753 for (l=0; l<nzB; l++) { 1754 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1755 } 1756 1757 ct2 += ncols; 1758 } 1759 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1760 } 1761 1762 /* Assemble submat */ 1763 /* First assemble the local rows */ 1764 for (j=0; j<nrow; j++) { 1765 row = irow[j]; 1766 proc = row2proc[j]; 1767 if (proc == rank) { 1768 Crow = row - rstart; /* local row index of C */ 1769 #if defined(PETSC_USE_CTABLE) 1770 row = rmap_loc[Crow]; /* row index of submat */ 1771 #else 1772 row = rmap[row]; 1773 #endif 1774 1775 if (allcolumns) { 1776 /* diagonal part A = c->A */ 1777 ncols = ai[Crow+1] - ai[Crow]; 1778 cols = aj + ai[Crow]; 1779 vals = a->a + ai[Crow]; 1780 i = 0; 1781 for (k=0; k<ncols; k++) { 1782 subcols[i] = cols[k] + cstart; 1783 subvals[i++] = vals[k]; 1784 } 1785 1786 /* off-diagonal part B = c->B */ 1787 ncols = bi[Crow+1] - bi[Crow]; 1788 cols = bj + bi[Crow]; 1789 vals = b->a + bi[Crow]; 1790 for (k=0; k<ncols; k++) { 1791 subcols[i] = bmap[cols[k]]; 1792 subvals[i++] = vals[k]; 1793 } 1794 1795 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1796 1797 } else { /* !allcolumns */ 1798 #if defined(PETSC_USE_CTABLE) 1799 /* diagonal part A = c->A */ 1800 ncols = ai[Crow+1] - ai[Crow]; 1801 cols = aj + ai[Crow]; 1802 vals = a->a + ai[Crow]; 1803 i = 0; 1804 for (k=0; k<ncols; k++) { 1805 tcol = cmap_loc[cols[k]]; 1806 if (tcol) { 1807 subcols[i] = --tcol; 1808 subvals[i++] = vals[k]; 1809 } 1810 } 1811 1812 /* off-diagonal part B = c->B */ 
1813 ncols = bi[Crow+1] - bi[Crow]; 1814 cols = bj + bi[Crow]; 1815 vals = b->a + bi[Crow]; 1816 for (k=0; k<ncols; k++) { 1817 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1818 if (tcol) { 1819 subcols[i] = --tcol; 1820 subvals[i++] = vals[k]; 1821 } 1822 } 1823 #else 1824 /* diagonal part A = c->A */ 1825 ncols = ai[Crow+1] - ai[Crow]; 1826 cols = aj + ai[Crow]; 1827 vals = a->a + ai[Crow]; 1828 i = 0; 1829 for (k=0; k<ncols; k++) { 1830 tcol = cmap[cols[k]+cstart]; 1831 if (tcol) { 1832 subcols[i] = --tcol; 1833 subvals[i++] = vals[k]; 1834 } 1835 } 1836 1837 /* off-diagonal part B = c->B */ 1838 ncols = bi[Crow+1] - bi[Crow]; 1839 cols = bj + bi[Crow]; 1840 vals = b->a + bi[Crow]; 1841 for (k=0; k<ncols; k++) { 1842 tcol = cmap[bmap[cols[k]]]; 1843 if (tcol) { 1844 subcols[i] = --tcol; 1845 subvals[i++] = vals[k]; 1846 } 1847 } 1848 #endif 1849 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1850 } 1851 } 1852 } 1853 1854 /* Now assemble the off-proc rows */ 1855 for (i=0; i<nrqs; i++) { /* for each requested message */ 1856 /* recv values from other processes */ 1857 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1858 proc = pa[idex]; 1859 sbuf1_i = sbuf1[proc]; 1860 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1861 ct1 = 2 + 1; 1862 ct2 = 0; /* count of received C->j */ 1863 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1864 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1865 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1866 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1867 1868 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1869 max1 = sbuf1_i[2]; /* num of rows */ 1870 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1871 row = sbuf1_i[ct1]; /* row index of submat */ 
1872 if (!allcolumns) { 1873 idex = 0; 1874 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1875 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1876 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1877 #if defined(PETSC_USE_CTABLE) 1878 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1879 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1880 } else { 1881 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1882 } 1883 #else 1884 tcol = cmap[rbuf3_i[ct2]]; 1885 #endif 1886 if (tcol) { 1887 subcols[idex] = --tcol; /* may not be sorted */ 1888 subvals[idex++] = rbuf4_i[ct2]; 1889 1890 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1891 For reuse, we replace received C->j with index that should be inserted to submat */ 1892 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1893 } 1894 } 1895 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1896 } else { /* scall == MAT_REUSE_MATRIX */ 1897 submat = submats[0]; 1898 subc = (Mat_SeqAIJ*)submat->data; 1899 1900 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1901 for (l=0; l<nnz; l++) { 1902 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1903 subvals[idex++] = rbuf4_i[ct2]; 1904 } 1905 1906 bj = subc->j + subc->i[row]; /* sorted column indices */ 1907 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1908 } 1909 } else { /* allcolumns */ 1910 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1911 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1912 ct2 += nnz; 1913 } 1914 } 1915 } 1916 1917 /* sending a->a are done */ 1918 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1919 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1920 1921 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1922 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  submats[0] = submat;

  /* Restore the indices */
  ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr);
  if (!allcolumns) {
    ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  for (i=0; i<nrqs; ++i) {
    /* NOTE(review): rbuf4[i] came from a per-request PetscMalloc1, while subcols/subvals came
       from a single PetscMalloc3 with rbuf4; freeing the triple here once per request looks
       mismatched with the allocation pairing -- verify against the PetscMalloc3/PetscFree3 contract */
    ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr);
  }
  ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* these buffers are kept alive for MAT_REUSE_MATRIX via the Mat_SubSppt struct only
       in the multi-IS code path; here they are released after the initial extraction */
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ_SingleIS - extract submatrices of the parallel AIJ matrix C
  when each process supplies exactly one (row,col) index-set pair.

  Detects the "all columns" special case (iscol[0] is the identity covering every global
  column of C) and forwards the work to MatCreateSubMatrices_MPIAIJ_SingleIS_Local().

  Input:  C - the MPIAIJ matrix; ismax, isrow, iscol - the index sets; scall - initial vs reuse.
  Output: submat - array of sequential submatrices (allocated here on MAT_INITIAL_MATRIX;
          length 2 so slot [1] can hold bookkeeping -- see callers for the exact convention).
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       ncol;
  PetscBool      colflag,allcolumns=PETSC_FALSE;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(2,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr);
  ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr);
  if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;

  ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatCreateSubMatrices_MPIAIJ - top-level driver that extracts ismax submatrices
  (given by isrow[]/iscol[] pairs) from the parallel AIJ matrix C as sequential matrices.

  Fast paths:
    - C->submat_singleis set (by PCSetUp_ASM) -> single-IS path, skipping the Allreduce;
    - every process requests the full identity square matrix -> MatCreateSubMatrix_MPIAIJ_All.

  Otherwise the extraction is staged: each stage handles at most nmax index sets, where
  nmax is sized so one stage's column work stays around 20M PetscInts.  The stage count is
  the global maximum over all ranks so every process executes the same number of collective
  passes, and it is cached in Mat_SubSppt for MAT_REUSE_MATRIX calls.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE;
  Mat_SeqAIJ     *subc;
  Mat_SubSppt    *smat;

  PetscFunctionBegin;
  /* Check for special case: each processor has a single IS */
  if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */
    ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */
    PetscFunctionReturn(0);
  }

  /* Collect global wantallmatrix and nstages */
  /* nmax = number of ISs processable per stage under a ~20M PetscInt column budget */
  if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
  else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  if (!nmax) nmax = 1;

  if (scall == MAT_INITIAL_MATRIX) {
    /* Collect global wantallmatrix and nstages */
    if (ismax == 1 && C->rmap->N == C->cmap->N) {
      ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
      ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
      ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
      if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
        wantallmatrix = PETSC_TRUE;

        ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
      }
    }

    /* Determine the number of stages through which submatrices are done
       Each stage will extract nmax submatrices.
       nmax is determined by the matrix column dimension.
       If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
    */
    nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */

    /* encode wantallmatrix as a negative value so MPI_MAX reduces to "all ranks want it" */
    in[0] = -1*(PetscInt)wantallmatrix;
    in[1] = nstages;
    ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
    wantallmatrix = (PetscBool)(-out[0]);
    nstages = out[1]; /* Make sure every processor loops through the global nstages */

  } else { /* MAT_REUSE_MATRIX */
    if (ismax) {
      subc = (Mat_SeqAIJ*)(*submat)[0]->data;
      smat = subc->submatis1;
    } else { /* (*submat)[0] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[0]->data;
    }
    if (!smat) {
      /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
      wantallmatrix = PETSC_TRUE;
    } else if (smat->singleis) {
      ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
      PetscFunctionReturn(0);
    } else {
      nstages = smat->nstages; /* replay the stage count recorded at initial extraction */
    }
  }

  if (wantallmatrix) {
    ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices and dummy submatrices */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
  }

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos >= ismax) max_no = 0; /* this rank is done; participate with an empty stage */
    else max_no = ismax-pos;

    ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
      smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++;
      smat->nstages = nstages;
    }
    pos += max_no;
  }

  if (ismax && scall == MAT_INITIAL_MATRIX) {
    /* save nstages for reuse */
    subc = (Mat_SeqAIJ*)(*submat)[0]->data;
    smat = subc->submatis1;
    smat->nstages = nstages;
  }
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
/*
  MatCreateSubMatrices_MPIAIJ_Local - workhorse for one stage of parallel submatrix
  extraction: builds (or refills, on MAT_REUSE_MATRIX) ismax sequential AIJ matrices,
  one per (isrow[i], iscol[i]) pair, gathering off-process rows of C with four rounds
  of nonblocking MPI traffic:

    tag0: requested row indices are sent to the owning ranks (sbuf1 / rbuf1);
    tag2: owners reply with per-row nonzero counts (sbuf2 / rbuf2);
    tag3: owners send the column indices of the requested rows (sbuf_aj / rbuf3);
    tag4: owners send the matrix values of the requested rows (sbuf_aa / rbuf4).

  On MAT_INITIAL_MATRIX all communication buffers and the row/column renumbering maps
  are created and stashed in a Mat_SubSppt attached to each submatrix so a later
  MAT_REUSE_MATRIX call (same sparsity pattern assumed) can skip rounds tag0-tag3
  and only re-exchange values.  The PETSC_USE_CTABLE branches use hash tables instead
  of dense global-length arrays for the global->local row/column maps.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
  ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
    if (!issorted[i]) iscsorted = issorted[i]; /* any unsorted column IS forces a final sort pass */

    ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);

    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);

    /* Check for special case: allcolumn */
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i] = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);

      /* recover the communication buffers/maps stashed at MAT_INITIAL_MATRIX time;
         every submatrix of the stage shares the same nrqs/nrqr/sbuf1/rbuf* set */
      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
      if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them*/
    ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);   /* mesg size, initialize work vectors */

    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
      row2proc[i] = row2proc_i;

      /* sorted rows let the ownership scan resume where it left off; otherwise restart per row */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j      = pa[i];
      w1[j] += w2[j] + 2* w3[j];
      msz   += w1[j];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    tag0 = ((PetscObject)C)->tag;
    ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
    ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

    {
      /* carve per-destination send buffers out of the single tmp slab */
      PetscInt *iptr = tmp;
      k    = 0;
      for (i=0; i<nrqs; i++) {
        j        = pa[i];
        iptr    += k;
        sbuf1[j] = iptr;
        k        = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
      ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0];
          sbuf1_j[2*k]   = ctr_j; /* number of rows requested for IS i from proc j */
          sbuf1_j[2*k-1] = i;     /* which IS these rows belong to */
        }
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
    rbuf2[0] = tmp + msz; /* reuse the second half of the tmp slab for the count replies */
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i        = rbuf1[i];
        start          = 2*rbuf1_i[0] + 1; /* skip header: count + (is_no,len) pairs */
        ierr           = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
        ierr           = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
        sbuf2_i        = sbuf2[i];
        for (j=start; j<end; j++) {
          id               = rbuf1_i[j] - rstart;
          ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; /* diag + off-diag nnz of the local row */
          sbuf2_i[j]       = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = r_status1[i].MPI_SOURCE;
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);
    ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

    /* Receive messages*/
    ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }
    ierr = PetscFree(r_status2);CHKERRQ(ierr);
    ierr = PetscFree(r_waits2);CHKERRQ(ierr);

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

    if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
    ierr = PetscFree(s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_status2);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols
               (global column order: B-part below cstart, then A-part, then B-part >= cend) */
            cols = sbuf_aj_i + ct2;

            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);

    /* create col map: global col of C -> local col of submatrices */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            /* keys and values are shifted by +1 so 0 can mean "absent" */
            ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            cmap_i[icol_i[j]] = j+1; /* +1 so 0 means "column not selected" */
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);

    if (ismax) {
      ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax       = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      ierr   = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    for (i=0; i<ismax; i++) {
      ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j; /* dense path stores the local row directly (no +1 shift) */
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
            row--;
            if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    ierr = PetscFree(r_waits3);CHKERRQ(ierr);
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
    ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree(s_waits3);CHKERRQ(ierr);

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNew(&smat_i);CHKERRQ(ierr);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      /* hook a custom destroy so the shared communication buffers are freed exactly once */
      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1       = sbuf1;
      smat_i->ptr         = ptr;
      smat_i->tmp         = tmp;
      smat_i->ctr         = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns  = allcolumns[i];
      smat_i->singleis    = PETSC_FALSE;
      smat_i->row2proc    = row2proc[i];
      smat_i->rmap        = rmap[i];
      smat_i->cmap        = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1       = sbuf1;
      smat_i->ptr         = ptr;
      smat_i->tmp         = tmp;
      smat_i->ctr         = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns  = PETSC_FALSE;
      smat_i->singleis    = PETSC_FALSE;
      smat_i->row2proc    = NULL;
      smat_i->rmap        = NULL;
      smat_i->cmap        = NULL;
    }

    if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row = rbuf1_i[ct1] - rstart;
          nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals
             (same ordering as the tag3 column-index message, so indices and values align) */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc      = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen = subc->ilen;
    imat_j    = subc->j;
    imat_i    = subc->i;
    imat_a    = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
        /* write straight into the submatrix CSR arrays instead of MatSetValues */
        mat_i    = imat_i[row];
        mat_a    = imat_a + mat_i;
        mat_j    = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1; /* undo the +1 "absent" shift of the col map */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k];  /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no  = sbuf1_i[2*j-1];
      rmap_i = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);

  ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
    Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
    of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
    If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
    After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
    state, and needs to be "assembled" later by compressing B's column space.
2884 2885 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2886 Following this call, C->A & C->B have been created, even if empty. 2887 */ 2888 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2889 { 2890 /* If making this function public, change the error returned in this function away from _PLIB. */ 2891 PetscErrorCode ierr; 2892 Mat_MPIAIJ *aij; 2893 Mat_SeqAIJ *Baij; 2894 PetscBool seqaij,Bdisassembled; 2895 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2896 PetscScalar v; 2897 const PetscInt *rowindices,*colindices; 2898 2899 PetscFunctionBegin; 2900 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2901 if (A) { 2902 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2903 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2904 if (rowemb) { 2905 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2906 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2907 } else { 2908 if (C->rmap->n != A->rmap->n) { 2909 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2910 } 2911 } 2912 if (dcolemb) { 2913 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2914 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2915 } else { 2916 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2917 } 2918 } 2919 if (B) { 2920 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2921 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2922 if (rowemb) { 2923 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2924 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2925 } else { 2926 if (C->rmap->n != B->rmap->n) { 2927 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2928 } 2929 } 2930 if (ocolemb) { 2931 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2932 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2933 } else { 2934 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2935 } 2936 } 2937 2938 aij = (Mat_MPIAIJ*)C->data; 2939 if (!aij->A) { 2940 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2941 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2942 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2943 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2944 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2945 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2946 } 2947 if (A) { 2948 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2949 } else { 2950 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2951 } 2952 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2953 /* 2954 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2955 need to "disassemble" B -- convert it to using C's global indices. 2956 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2957 2958 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2959 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2960 2961 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2962 At least avoid calling MatSetValues() and the implied searches? 2963 */ 2964 2965 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2966 #if defined(PETSC_USE_CTABLE) 2967 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2968 #else 2969 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2970 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2971 if (aij->B) { 2972 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2973 } 2974 #endif 2975 ngcol = 0; 2976 if (aij->lvec) { 2977 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2978 } 2979 if (aij->garray) { 2980 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2981 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2982 } 2983 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2984 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2985 } 2986 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2987 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2988 } 2989 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2990 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2991 } 2992 } 2993 Bdisassembled = PETSC_FALSE; 2994 if (!aij->B) { 2995 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2996 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2997 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 2998 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 2999 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3000 Bdisassembled = PETSC_TRUE; 3001 } 3002 if (B) { 3003 Baij = (Mat_SeqAIJ*)B->data; 3004 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3005 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3006 for (i=0; i<B->rmap->n; i++) { 3007 nz[i] = Baij->i[i+1] - Baij->i[i]; 3008 } 3009 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3010 ierr = PetscFree(nz);CHKERRQ(ierr); 3011 } 3012 3013 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3014 shift = rend-rstart; 3015 count = 0; 3016 rowindices = NULL; 3017 colindices = NULL; 3018 if (rowemb) { 3019 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3020 } 3021 if (ocolemb) { 3022 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3023 } 3024 for (i=0; i<B->rmap->n; i++) { 3025 PetscInt row; 3026 row = i; 3027 if (rowindices) row = rowindices[i]; 3028 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3029 col = Baij->j[count]; 3030 if (colindices) col = colindices[col]; 3031 if (Bdisassembled && col>=rstart) col += shift; 3032 v = Baij->a[count]; 3033 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3034 ++count; 3035 } 3036 } 3037 /* No assembly for aij->B is necessary. */ 3038 /* FIXME: set aij->B's nonzerostate correctly. */ 3039 } else { 3040 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3041 } 3042 C->preallocated = PETSC_TRUE; 3043 C->was_assembled = PETSC_FALSE; 3044 C->assembled = PETSC_FALSE; 3045 /* 3046 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3047 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3048 */ 3049 PetscFunctionReturn(0); 3050 } 3051 3052 /* 3053 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3054 */ 3055 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3056 { 3057 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3058 3059 PetscFunctionBegin; 3060 PetscValidPointer(A,2); 3061 PetscValidPointer(B,3); 3062 /* FIXME: make sure C is assembled */ 3063 *A = aij->A; 3064 *B = aij->B; 3065 /* Note that we don't incref *A and *B, so be careful! */ 3066 PetscFunctionReturn(0); 3067 } 3068 3069 /* 3070 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
3071 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3072 */ 3073 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3074 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3075 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3076 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3077 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3078 { 3079 PetscErrorCode ierr; 3080 PetscMPIInt isize,flag; 3081 PetscInt i,ii,cismax,ispar; 3082 Mat *A,*B; 3083 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3084 3085 PetscFunctionBegin; 3086 if (!ismax) PetscFunctionReturn(0); 3087 3088 for (i = 0, cismax = 0; i < ismax; ++i) { 3089 PetscMPIInt isize; 3090 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr); 3091 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3092 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr); 3093 if (isize > 1) ++cismax; 3094 } 3095 3096 /* 3097 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3098 ispar counts the number of parallel ISs across C's comm. 3099 */ 3100 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3101 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3102 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3103 PetscFunctionReturn(0); 3104 } 3105 3106 /* if (ispar) */ 3107 /* 3108 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3109 These are used to extract the off-diag portion of the resulting parallel matrix. 
3110 The row IS for the off-diag portion is the same as for the diag portion, 3111 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3112 */ 3113 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3114 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3115 for (i = 0, ii = 0; i < ismax; ++i) { 3116 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3117 if (isize > 1) { 3118 /* 3119 TODO: This is the part that's ***NOT SCALABLE***. 3120 To fix this we need to extract just the indices of C's nonzero columns 3121 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3122 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3123 to be done without serializing on the IS list, so, most likely, it is best 3124 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3125 */ 3126 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3127 /* Now we have to 3128 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3129 were sorted on each rank, concatenated they might no longer be sorted; 3130 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3131 indices in the nondecreasing order to the original index positions. 3132 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3133 */ 3134 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3135 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3136 ++ii; 3137 } 3138 } 3139 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3140 for (i = 0, ii = 0; i < ismax; ++i) { 3141 PetscInt j,issize; 3142 const PetscInt *indices; 3143 3144 /* 3145 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 
3146 */ 3147 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3148 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3149 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3150 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3151 for (j = 1; j < issize; ++j) { 3152 if (indices[j] == indices[j-1]) { 3153 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3154 } 3155 } 3156 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3157 3158 3159 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3160 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3161 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3162 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3163 for (j = 1; j < issize; ++j) { 3164 if (indices[j-1] == indices[j]) { 3165 SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3166 } 3167 } 3168 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3169 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3170 if (isize > 1) { 3171 cisrow[ii] = isrow[i]; 3172 ++ii; 3173 } 3174 } 3175 /* 3176 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3177 array of sequential matrices underlying the resulting parallel matrices. 3178 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3179 contain duplicates. 3180 3181 There are as many diag matrices as there are original index sets. There are only as many parallel 3182 and off-diag matrices, as there are parallel (comm size > 1) index sets. 
3183 3184 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3185 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3186 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3187 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3188 with A[i] and B[ii] extracted from the corresponding MPI submat. 3189 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3190 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3191 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3192 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3193 values into A[i] and B[ii] sitting inside the corresponding submat. 3194 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3195 A[i], B[ii], AA[i] or BB[ii] matrices. 3196 */ 3197 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3198 if (scall == MAT_INITIAL_MATRIX) { 3199 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3200 } 3201 3202 /* Now obtain the sequential A and B submatrices separately. 
*/ 3203 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3204 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3205 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3206 3207 /* 3208 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3209 matrices A & B have been extracted directly into the parallel matrices containing them, or 3210 simply into the sequential matrix identical with the corresponding A (if isize == 1). 3211 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3212 to have the same sparsity pattern. 3213 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3214 must be constructed for C. This is done by setseqmat(s). 3215 */ 3216 for (i = 0, ii = 0; i < ismax; ++i) { 3217 /* 3218 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3219 That way we can avoid sorting and computing permutations when reusing. 3220 To this end: 3221 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3222 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3223 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3224 */ 3225 MatStructure pattern; 3226 pattern = DIFFERENT_NONZERO_PATTERN; 3227 3228 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr); 3229 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3230 if (isize > 1) { 3231 if (scall == MAT_INITIAL_MATRIX) { 3232 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3233 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3234 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3235 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3236 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3237 } 3238 /* 3239 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3240 */ 3241 { 3242 Mat AA,BB; 3243 AA = A[i]; 3244 BB = B[ii]; 3245 if (AA || BB) { 3246 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3247 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3248 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3249 } 3250 3251 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3252 } 3253 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3254 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3255 ++ii; 3256 } else { /* if (isize == 1) */ 3257 if (scall == MAT_REUSE_MATRIX) { 3258 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3259 } 3260 if (isrow_p[i] || iscol_p[i]) { 3261 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3262 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3263 /* Otherwise A is extracted straight into (*submats)[i]. */ 3264 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3265 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3266 } else (*submat)[i] = A[i]; 3267 } 3268 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3269 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3270 } 3271 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3272 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3273 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3274 ierr = PetscFree(A);CHKERRQ(ierr); 3275 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3276 PetscFunctionReturn(0); 3277 } 3278 3279 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3280 { 3281 PetscErrorCode ierr; 3282 3283 PetscFunctionBegin; 3284 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3285 PetscFunctionReturn(0); 3286 } 3287