/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <petscsf.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);

/*
   MatIncreaseOverlap_MPIAIJ - Increase the overlap of the imax index sets is[]
   by ov levels, by applying the one-level expansion routine ov times.

   Input:
     C    - the MPIAIJ matrix
     imax - number of index sets
     is   - array of index sets, replaced in place by the enlarged sets
     ov   - number of overlap levels requested; must be non-negative
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Scalable - Same contract as MatIncreaseOverlap_MPIAIJ,
   but each one-level expansion uses the memory-scalable (PetscSF-based) algorithm.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Once_Scalable - One level of overlap growth, memory-scalable
   variant.  Rows of is[] owned by remote ranks are routed to their owners with
   PetscCommBuildTwoSided + PetscSF broadcasts; the owners expand them
   (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and the expanded column lists are shipped
   back and merged into is[] (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).  Locally
   owned rows are expanded by MatIncreaseOverlap_MPIAIJ_Local_Scalable, overlapped
   with the communication where possible.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
{
  PetscErrorCode ierr;
  PetscInt       *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
  PetscInt       *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
  PetscInt       nrecvrows,*sbsizes = NULL,*sbdata = NULL;
  const PetscInt *indices_i,**indices;
  PetscLayout    rmap;
  PetscMPIInt    rank,size,*toranks,*fromranks,nto,nfrom,owner;
  PetscSF        sf;
  PetscSFNode    *remote;
  MPI_Comm       comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* get row map to determine where rows should be going */
  ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr);
  /* retrieve IS data and put all together so that we
   * can optimize communication
   * */
  ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr);
  for (i=0,tlength=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr);
    tlength += length[i];
    ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  /* find these rows on remote processors */
  ierr   = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr);
  ierr   = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr);
  nrrows = 0;
  for (i=0; i<nidx; i++) {
    length_i  = length[i];
    indices_i = indices[i];
    for (j=0; j<length_i; j++) {
      owner = -1;
      ierr  = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr);
      /* remote processors: record row, owning rank and originating IS */
      if (owner != rank) {
        tosizes_temp[owner]++;               /* number of rows to owner */
        rrow_ranks[nrrows]   = owner;        /* processor */
        rrow_isids[nrrows]   = i;            /* is id */
        remoterows[nrrows++] = indices_i[j]; /* row */
      }
    }
    ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr);
  /* test if we need to exchange messages
   * generally speaking, we do not need to exchange
   * data when overlap is 1
   * */
  ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr);
  /* we do not have any messages
   * It usually corresponds to overlap 1
   * */
  if (!reducednrrows) {
    ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
    ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
    ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  nto = 0;
  /* send sizes and ranks for building a two-sided communcation */
  for (i=0; i<size; i++) {
    if (tosizes_temp[i]) {
      tosizes[nto*2]  = tosizes_temp[i]*2; /* size: each row entry is a <is id, row> pair */
      tosizes_temp[i] = nto;               /* a map from processor to index */
      toranks[nto++]  = i;                 /* processor */
    }
  }
  ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr);
  toffsets[0] = 0;
  for (i=0; i<nto; i++) {
    toffsets[i+1]  = toffsets[i]+tosizes[2*i]; /* offsets */
    tosizes[2*i+1] = toffsets[i];              /* offsets to send */
  }
  /* send information to other processors */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nfrom; i++) {
    for (j=0; j<fromsizes[2*i]; j++) {
      remote[nrecvrows].rank    = fromranks[i];
      remote[nrecvrows++].index = fromsizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* message pair <no of is, row> */
  ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr);
  for (i=0; i<nrrows; i++) {
    owner = rrow_ranks[i];       /* processor */
    j     = tosizes_temp[owner]; /* index */
    todata[toffsets[j]++] = rrow_isids[i];
    todata[toffsets[j]++] = remoterows[i];
  }
  ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr);
  ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr);
  ierr = PetscFree(toffsets);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  /* send rows belonging to the remote so that then we could get the overlapping data back */
  ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr);
  ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr);
  ierr = PetscFree(fromsizes);CHKERRQ(ierr);
  /* build the reverse communication pattern for shipping the expanded rows back */
  ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr);
  ierr = PetscFree(fromranks);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
  ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr);
  nrecvrows = 0;
  for (i=0; i<nto; i++) {
    for (j=0; j<tosizes[2*i]; j++) {
      remote[nrecvrows].rank    = toranks[i];
      remote[nrecvrows++].index = tosizes[2*i+1]+j;
    }
  }
  ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */
  ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
  /* overlap communication and computation */
  ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
  ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr);
  ierr = PetscFree(toranks);CHKERRQ(ierr);
  ierr = PetscFree(tosizes);CHKERRQ(ierr);
  ierr = PetscFree(todata);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Receive_Scalable - Merge overlap data received from
   remote owners into the local index sets.

   Input:
     mat      - the matrix (unused here beyond context)
     nidx     - number of index sets
     is       - index sets; destroyed and recreated with the merged, deduplicated indices
     nrecvs   - length of recvdata
     recvdata - received stream of triples <is id, count, count columns...>
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
{
  PetscInt       *isz,isz_i,i,j,is_id, data_size;
  PetscInt       col,lsize,max_lsize,*indices_temp, *indices_i;
  const PetscInt *indices_i_temp;
  MPI_Comm       *iscomms;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  max_lsize = 0;
  ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    max_lsize = lsize>max_lsize ? lsize:max_lsize;
    isz[i]    = lsize;
  }
  /* each IS gets a slot of (max_lsize+nrecvs) entries: worst case all received
     columns land in a single IS */
  ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr);
  for (i=0; i<nidx; i++){
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr);
    ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }
  /* retrieve information to get row id and its overlap */
  for (i=0; i<nrecvs;){
    is_id     = recvdata[i++];
    data_size = recvdata[i++];
    indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
    isz_i     = isz[is_id];
    for (j=0; j< data_size; j++){
      col = recvdata[i++];
      indices_i[isz_i++] = col;
    }
    isz[is_id] = isz_i;
  }
  /* remove duplicate entities */
  for (i=0; i<nidx; i++){
    indices_i = indices_temp+(max_lsize+nrecvs)*i;
    isz_i     = isz[i];
    ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(isz);CHKERRQ(ierr);
  ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Send_Scalable - For rows requested by remote ranks,
   gather the column indices of those (locally owned) rows from the diagonal (A)
   and off-diagonal (B) blocks, deduplicate per requesting <rank, is> pair, and
   pack them into a send buffer of triples <is id, count, columns...>.

   Output (ownership transferred to the caller, freed with PetscFree2):
     sbrowsizes - per-source-rank <size, offset> pairs
     sbrows     - the packed data stream
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
{
  PetscLayout    rmap,cmap;
  PetscInt       i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
  PetscInt       is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
  PetscInt       *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
  const PetscInt *gcols,*ai,*aj,*bi,*bj;
  Mat            amat,bmat;
  PetscMPIInt    rank;
  PetscBool      done;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  /* Even if the mat is symmetric, we still assume it is not symmetric */
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* total number of nonzero values is used to estimate the memory usage in the next step */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* to find the longest message */
  max_fszs = 0;
  for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
  /* better way to estimate number of nonzero in the mat??? */
  ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
  rows_pos  = 0;
  totalrows = 0;
  /* first pass: only count how many columns each <rank, is> pair will produce,
     so the exact send-buffer size is known before packing */
  for (i=0; i<nfrom; i++){
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    /* group data together */
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];/* no of is */
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
    }
    /* estimate a space to avoid multiple allocations */
    for (j=0; j<nidx; j++){
      indvc_ij = 0;
      rows_i   = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;  /* global -> local row */
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;    /* local -> global column */
          indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];       /* off-diagonal block already stores global columns via gcols */
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      indv_counts[i*nidx+j] = indvc_ij;
      totalrows            += indvc_ij;
    }
  }
  /* message triple <no of is, number of rows, rows> */
  ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr);
  totalrows = 0;
  rows_pos  = 0;
  /* second pass: use this code again, now packing the data into sbdata */
  for (i=0;i<nfrom;i++){
    ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr);
    for (j=0; j<fromsizes[2*i]; j+=2){
      is_id  = fromrows[rows_pos++];
      rows_i = rows_data[is_id];
      rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
    }
    /* add data */
    for (j=0; j<nidx; j++){
      if (!indv_counts[i*nidx+j]) continue;
      indvc_ij = 0;
      sbdata[totalrows++] = j;
      sbdata[totalrows++] = indv_counts[i*nidx+j];
      sbsizes[2*i]       += 2;
      rows_i              = rows_data[j];
      for (l=0; l<rows_pos_i[j]; l++){
        row   = rows_i[l]-rstart;
        start = ai[row];
        end   = ai[row+1];
        for (k=start; k<end; k++){ /* Amat */
          col = aj[k] + cstart;
          indices_tmp[indvc_ij++] = col;
        }
        start = bi[row];
        end   = bi[row+1];
        for (k=start; k<end; k++) { /* Bmat */
          col = gcols[bj[k]];
          indices_tmp[indvc_ij++] = col;
        }
      }
      ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr);
      sbsizes[2*i] += indvc_ij;
      ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr);
      totalrows += indvc_ij;
    }
  }
  /* convert per-rank sizes to <size, offset> pairs expected by the caller */
  ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr);
  offsets[0] = 0;
  for (i=0; i<nfrom; i++){
    offsets[i+1]   = offsets[i] + sbsizes[2*i];
    sbsizes[2*i+1] = offsets[i];
  }
  ierr = PetscFree(offsets);CHKERRQ(ierr);
  if (sbrowsizes) *sbrowsizes = sbsizes;
  if (sbrows) *sbrows = sbdata;
  ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local_Scalable - Expand each index set by one level
   using only locally owned rows: for every row of is[i] owned by this rank, add
   the column indices of that row (diagonal block A and off-diagonal block B).
   Rows owned by other ranks are skipped (they are handled by the send/receive
   path).  Each is[i] is destroyed and recreated sorted and deduplicated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
{
  const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
  PetscInt       tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
  PetscInt       lsize,lsize_tmp;
  PetscMPIInt    rank,owner;
  Mat            amat,bmat;
  PetscBool      done;
  PetscLayout    cmap,rmap;
  MPI_Comm       comm;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr);
  ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n");
  /* is it a safe way to compute number of nonzero values ? */
  tnz  = ai[an]+bi[bn];
  ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr);
  ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr);
  /* it is a better way to estimate memory than the old implementation
   * where global size of matrix is used
   * */
  ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr);
  for (i=0; i<nidx; i++) {
    MPI_Comm iscomm;

    ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr);
    ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr);
    lsize_tmp = 0;
    for (j=0; j<lsize; j++) {
      owner = -1;
      row   = indices[j];
      ierr  = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr);
      if (owner != rank) continue;  /* remote rows handled via the send/receive path */
      /* local number */
      row -= rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        col = aj[k] + cstart;
        indices_temp[lsize_tmp++] = col;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        col = gcols[bj[k]];
        indices_temp[lsize_tmp++] = col;
      }
    }
    ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr);
    /* keep the IS communicator alive across the destroy/create cycle */
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr);
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
    ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr);
    ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr);
    ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr);
  }
  ierr = PetscFree(indices_temp);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr);
  ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *table_data,table_data_i;
  PetscInt       *tdata,tcount,tcount_max;
#else
  PetscInt       *data_i,*d_p;
#endif
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,proc = 0;
  PetscInt       M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  MPI_Comm       *iscomms;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  for (i=0; i<imax; i++) {
    ierr  = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      /* w1[j]: total data entries to rank j; w3[j]: number of ISs touching rank j */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to intself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
  ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

  /* carve the single buffer tmp into per-destination slices */
  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    PetscInt M_BPB_imax = 0;
#if defined(PETSC_USE_CTABLE)
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr);
    }
    ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
    }
#else
    PetscInt Mimax = 0;
    ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr);
    ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr);
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
#endif
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
#if defined(PETSC_USE_CTABLE)
      table_data_i = table_data[i];
#else
      data_i  = data[i];
#endif
      isz_i = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) { /* local row not seen before */
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = row; /* Update the local table */
#endif
          isz_i++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
    ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
  }
  ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
#if defined(PETSC_USE_CTABLE)
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr);
#else
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr);
#endif

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the rows requested by remote ranks */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1; /* data starts right after the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        table_i = table[is_no];
#if defined(PETSC_USE_CTABLE)
        table_data_i = table_data[is_no];
#else
        data_i  = data[is_no];
#endif
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
            data_i[isz_i] = row;
#endif
            isz_i++;
          }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

#if defined(PETSC_USE_CTABLE)
  /* size the scratch array tdata to hold the largest table */
  tcount_max = 0;
  for (i=0; i<imax; ++i) {
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount_max < tcount) tcount_max = tcount;
  }
  ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr);
#endif

  /* rebuild each IS from the accumulated (table or array) data */
  for (i=0; i<imax; ++i) {
#if defined(PETSC_USE_CTABLE)
    PetscTablePosition tpos;
    table_data_i = table_data[i];

    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr);
      tdata[--j] = --k; /* table stores key+1 -> position+1; undo the shift */
    }
    ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#else
    ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
#endif
    ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
  }

  ierr = PetscFree(iscomms);CHKERRQ(ierr);
  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<imax; i++) {
    ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(table_data);CHKERRQ(ierr);
  ierr = PetscFree(tdata);CHKERRQ(ierr);
  ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr);
#else
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data or table_data  - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,isz_i;
  PetscBT    table_i;
#if defined(PETSC_USE_CTABLE)
  PetscTable         table_data_i;
  PetscErrorCode     ierr;
  PetscTablePosition tpos;
  PetscInt           tcount,*tdata;
#else
  PetscInt           *data_i;
#endif

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
#if defined(PETSC_USE_CTABLE)
    /* copy existing entries of table_data_i into tdata[] */
    table_data_i = table_data[i];
    ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr);
    if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]);

    ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr);
    ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr);
    while (tpos) {
      ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr);
      tdata[--j] = --row; /* table stores key+1 -> position+1; undo the shift */
      if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount);
    }
#else
    data_i = data[i];
#endif
    table_i = table[i];
    isz_i   = isz[i];
    max     = isz[i];

    /* for each row already in the set, add the columns of that row;
       the bit table table_i prevents duplicates */
    for (j=0; j<max; j++) {
#if defined(PETSC_USE_CTABLE)
      row = tdata[j] - rstart;
#else
      row = data_i[j] - rstart;
#endif
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) {
#if defined(PETSC_USE_CTABLE)
          ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr);
#else
          data_i[isz_i] = val;
#endif
          isz_i++;
        }
      }
    }
    isz[i] = isz_i;

#if defined(PETSC_USE_CTABLE)
    ierr = PetscFree(tdata);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}

/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* first pass: count the total number of requested rows (ct) and total data length (total_sz) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* estimate the reply size from the average number of nonzeros per local row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          /* grow the single shared buffer by 1.5x and rebase all per-message pointers */
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each newly stored row by the columns of its local A and B parts */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart; /* diagonal block: shift to global column numbering */
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]]; /* off-diagonal block: garray maps to global columns */
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message starts right after this one in the shared buffer */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor
         the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* fill in the lengths of the locally owned rows (diagonal + off-diagonal parts) */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* each rank contributes its own segment of lens[] in place */
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: the structure already exists, only values may be refreshed below */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values are written in the same lower/diagonal/upper order as the column indices above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: the INITIAL branch did not allocate these */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the gathered copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable     cmap,rmap;
  PetscInt       *cmap_loc,*rmap_loc;
#else
  PetscInt       *cmap,*rmap;
#endif
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt       *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar    *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*subvals=NULL;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3 = NULL,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
  MPI_Status     *r_status3 = NULL,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1264 PetscMPIInt *onodes1,*olengths1,idex,end; 1265 Mat_SubSppt *smatis1; 1266 PetscBool isrowsorted,iscolsorted; 1267 1268 PetscFunctionBegin; 1269 PetscValidLogicalCollectiveInt(C,ismax,2); 1270 PetscValidLogicalCollectiveEnum(C,scall,5); 1271 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1272 1273 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1274 size = c->size; 1275 rank = c->rank; 1276 1277 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1278 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1279 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1280 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1281 if (allcolumns) { 1282 icol = NULL; 1283 ncol = C->cmap->N; 1284 } else { 1285 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1286 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1287 } 1288 1289 if (scall == MAT_INITIAL_MATRIX) { 1290 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1291 1292 /* Get some new tags to keep the communication clean */ 1293 tag1 = ((PetscObject)C)->tag; 1294 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1295 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1296 1297 /* evaluate communication - mesg to who, length of mesg, and buffer space 1298 required. 
Based on this, buffers are allocated, and data copied into them */ 1299 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1300 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1301 1302 /* w1[proc] = num of rows owned by proc -- to be requested */ 1303 proc = 0; 1304 nrqs = 0; /* num of outgoing messages */ 1305 for (j=0; j<nrow; j++) { 1306 row = irow[j]; 1307 if (!isrowsorted) proc = 0; 1308 while (row >= C->rmap->range[proc+1]) proc++; 1309 w1[proc]++; 1310 row2proc[j] = proc; /* map row index to proc */ 1311 1312 if (proc != rank && !w2[proc]) { 1313 w2[proc] = 1; nrqs++; 1314 } 1315 } 1316 w1[rank] = 0; /* rows owned by self will not be requested */ 1317 1318 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1319 for (proc=0,j=0; proc<size; proc++) { 1320 if (w1[proc]) { pa[j++] = proc;} 1321 } 1322 1323 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1324 msz = 0; /* total mesg length (for all procs) */ 1325 for (i=0; i<nrqs; i++) { 1326 proc = pa[i]; 1327 w1[proc] += 3; 1328 msz += w1[proc]; 1329 } 1330 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1331 1332 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1333 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1334 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1335 1336 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1337 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1338 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1339 1340 /* Now post the Irecvs corresponding to these messages */ 1341 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1342 1343 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1344 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1345 1346 /* Allocate Memory for outgoing messages 
*/ 1347 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1348 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1349 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1350 1351 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1352 iptr = tmp; 1353 for (i=0; i<nrqs; i++) { 1354 proc = pa[i]; 1355 sbuf1[proc] = iptr; 1356 iptr += w1[proc]; 1357 } 1358 1359 /* Form the outgoing messages */ 1360 /* Initialize the header space */ 1361 for (i=0; i<nrqs; i++) { 1362 proc = pa[i]; 1363 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1364 ptr[proc] = sbuf1[proc] + 3; 1365 } 1366 1367 /* Parse the isrow and copy data into outbuf */ 1368 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1369 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1370 proc = row2proc[j]; 1371 if (proc != rank) { /* copy to the outgoing buf*/ 1372 *ptr[proc] = irow[j]; 1373 ctr[proc]++; ptr[proc]++; 1374 } 1375 } 1376 1377 /* Update the headers for the current IS */ 1378 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1379 if ((ctr_j = ctr[j])) { 1380 sbuf1_j = sbuf1[j]; 1381 k = ++sbuf1_j[0]; 1382 sbuf1_j[2*k] = ctr_j; 1383 sbuf1_j[2*k-1] = 0; 1384 } 1385 } 1386 1387 /* Now post the sends */ 1388 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1389 for (i=0; i<nrqs; ++i) { 1390 proc = pa[i]; 1391 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRQ(ierr); 1392 } 1393 1394 /* Post Receives to capture the buffer size */ 1395 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1396 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1397 1398 rbuf2[0] = tmp + msz; 1399 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1400 1401 for (i=0; i<nrqs; ++i) { 1402 proc = pa[i]; 1403 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRQ(ierr); 1404 } 1405 1406 ierr = 
PetscFree2(w1,w2);CHKERRQ(ierr); 1407 1408 /* Send to other procs the buf size they should allocate */ 1409 /* Receive messages*/ 1410 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1411 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1412 1413 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr); 1414 for (i=0; i<nrqr; ++i) { 1415 req_size[i] = 0; 1416 rbuf1_i = rbuf1[i]; 1417 start = 2*rbuf1_i[0] + 1; 1418 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1419 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1420 sbuf2_i = sbuf2[i]; 1421 for (j=start; j<end; j++) { 1422 k = rbuf1_i[j] - rstart; 1423 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1424 sbuf2_i[j] = ncols; 1425 req_size[i] += ncols; 1426 } 1427 req_source1[i] = r_status1[i].MPI_SOURCE; 1428 1429 /* form the header */ 1430 sbuf2_i[0] = req_size[i]; 1431 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1432 1433 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr); 1434 } 1435 1436 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1437 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1438 1439 /* rbuf2 is received, Post recv column indices a->j */ 1440 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr); 1441 1442 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1443 for (i=0; i<nrqs; ++i) { 1444 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1445 req_source2[i] = r_status2[i].MPI_SOURCE; 1446 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr); 1447 } 1448 1449 /* Wait on sends1 and sends2 */ 1450 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1451 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr); 1452 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1453 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1454 1455 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr); 1456 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1457 1458 /* Now allocate sending buffers for a->j, and send them off */ 1459 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1460 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1461 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1462 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1463 1464 for (i=0; i<nrqr; i++) { /* for each requested message */ 1465 rbuf1_i = rbuf1[i]; 1466 sbuf_aj_i = sbuf_aj[i]; 1467 ct1 = 2*rbuf1_i[0] + 1; 1468 ct2 = 0; 1469 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1470 1471 kmax = rbuf1[i][2]; 1472 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1473 row = rbuf1_i[ct1] - rstart; 1474 nzA = ai[row+1] - ai[row]; 1475 nzB = bi[row+1] - bi[row]; 1476 ncols = nzA + nzB; 1477 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1478 1479 /* load the column indices for this row into cols*/ 1480 cols = sbuf_aj_i + ct2; 1481 1482 lwrite = 0; 1483 for (l=0; l<nzB; l++) { 1484 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1485 } 1486 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1487 for (l=0; l<nzB; l++) { 1488 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1489 } 1490 1491 ct2 += ncols; 1492 } 1493 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr); 1494 } 1495 1496 /* create column map (cmap): global col of C -> local col of submat */ 1497 #if defined(PETSC_USE_CTABLE) 1498 if (!allcolumns) { 1499 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1500 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1501 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1502 if (icol[j] >= cstart && icol[j] <cend) { 1503 cmap_loc[icol[j] - cstart] = j+1; 1504 } else { /* use PetscTable for non-local col indices */ 1505 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1506 } 1507 
} 1508 } else { 1509 cmap = NULL; 1510 cmap_loc = NULL; 1511 } 1512 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1513 #else 1514 if (!allcolumns) { 1515 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1516 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1517 } else { 1518 cmap = NULL; 1519 } 1520 #endif 1521 1522 /* Create lens for MatSeqAIJSetPreallocation() */ 1523 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1524 1525 /* Compute lens from local part of C */ 1526 for (j=0; j<nrow; j++) { 1527 row = irow[j]; 1528 proc = row2proc[j]; 1529 if (proc == rank) { 1530 /* diagonal part A = c->A */ 1531 ncols = ai[row-rstart+1] - ai[row-rstart]; 1532 cols = aj + ai[row-rstart]; 1533 if (!allcolumns) { 1534 for (k=0; k<ncols; k++) { 1535 #if defined(PETSC_USE_CTABLE) 1536 tcol = cmap_loc[cols[k]]; 1537 #else 1538 tcol = cmap[cols[k]+cstart]; 1539 #endif 1540 if (tcol) lens[j]++; 1541 } 1542 } else { /* allcolumns */ 1543 lens[j] = ncols; 1544 } 1545 1546 /* off-diagonal part B = c->B */ 1547 ncols = bi[row-rstart+1] - bi[row-rstart]; 1548 cols = bj + bi[row-rstart]; 1549 if (!allcolumns) { 1550 for (k=0; k<ncols; k++) { 1551 #if defined(PETSC_USE_CTABLE) 1552 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1553 #else 1554 tcol = cmap[bmap[cols[k]]]; 1555 #endif 1556 if (tcol) lens[j]++; 1557 } 1558 } else { /* allcolumns */ 1559 lens[j] += ncols; 1560 } 1561 } 1562 } 1563 1564 /* Create row map (rmap): global row of C -> local row of submat */ 1565 #if defined(PETSC_USE_CTABLE) 1566 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1567 for (j=0; j<nrow; j++) { 1568 row = irow[j]; 1569 proc = row2proc[j]; 1570 if (proc == rank) { /* a local row */ 1571 rmap_loc[row - rstart] = j; 1572 } else { 1573 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1574 } 1575 } 1576 #else 1577 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1578 for (j=0; j<nrow; j++) { 1579 rmap[irow[j]] = j; 1580 } 1581 #endif 1582 
1583 /* Update lens from offproc data */ 1584 /* recv a->j is done */ 1585 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1586 for (i=0; i<nrqs; i++) { 1587 proc = pa[i]; 1588 sbuf1_i = sbuf1[proc]; 1589 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1590 ct1 = 2 + 1; 1591 ct2 = 0; 1592 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1593 rbuf3_i = rbuf3[i]; /* received C->j */ 1594 1595 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1596 max1 = sbuf1_i[2]; 1597 for (k=0; k<max1; k++,ct1++) { 1598 #if defined(PETSC_USE_CTABLE) 1599 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1600 row--; 1601 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1602 #else 1603 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1604 #endif 1605 /* Now, store row index of submat in sbuf1_i[ct1] */ 1606 sbuf1_i[ct1] = row; 1607 1608 nnz = rbuf2_i[ct1]; 1609 if (!allcolumns) { 1610 for (l=0; l<nnz; l++,ct2++) { 1611 #if defined(PETSC_USE_CTABLE) 1612 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1613 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1614 } else { 1615 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1616 } 1617 #else 1618 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1619 #endif 1620 if (tcol) lens[row]++; 1621 } 1622 } else { /* allcolumns */ 1623 lens[row] += nnz; 1624 } 1625 } 1626 } 1627 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr); 1628 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1629 1630 /* Create the submatrices */ 1631 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1632 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1633 1634 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1635 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1636 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1637 ierr = 
MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1638 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1639 1640 /* create struct Mat_SubSppt and attached it to submat */ 1641 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1642 subc = (Mat_SeqAIJ*)submat->data; 1643 subc->submatis1 = smatis1; 1644 1645 smatis1->id = 0; 1646 smatis1->nrqs = nrqs; 1647 smatis1->nrqr = nrqr; 1648 smatis1->rbuf1 = rbuf1; 1649 smatis1->rbuf2 = rbuf2; 1650 smatis1->rbuf3 = rbuf3; 1651 smatis1->sbuf2 = sbuf2; 1652 smatis1->req_source2 = req_source2; 1653 1654 smatis1->sbuf1 = sbuf1; 1655 smatis1->ptr = ptr; 1656 smatis1->tmp = tmp; 1657 smatis1->ctr = ctr; 1658 1659 smatis1->pa = pa; 1660 smatis1->req_size = req_size; 1661 smatis1->req_source1 = req_source1; 1662 1663 smatis1->allcolumns = allcolumns; 1664 smatis1->singleis = PETSC_TRUE; 1665 smatis1->row2proc = row2proc; 1666 smatis1->rmap = rmap; 1667 smatis1->cmap = cmap; 1668 #if defined(PETSC_USE_CTABLE) 1669 smatis1->rmap_loc = rmap_loc; 1670 smatis1->cmap_loc = cmap_loc; 1671 #endif 1672 1673 smatis1->destroy = submat->ops->destroy; 1674 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1675 submat->factortype = C->factortype; 1676 1677 /* compute rmax */ 1678 rmax = 0; 1679 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1680 1681 } else { /* scall == MAT_REUSE_MATRIX */ 1682 submat = submats[0]; 1683 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1684 1685 subc = (Mat_SeqAIJ*)submat->data; 1686 rmax = subc->rmax; 1687 smatis1 = subc->submatis1; 1688 nrqs = smatis1->nrqs; 1689 nrqr = smatis1->nrqr; 1690 rbuf1 = smatis1->rbuf1; 1691 rbuf2 = smatis1->rbuf2; 1692 rbuf3 = smatis1->rbuf3; 1693 req_source2 = smatis1->req_source2; 1694 1695 sbuf1 = smatis1->sbuf1; 1696 sbuf2 = smatis1->sbuf2; 1697 ptr = smatis1->ptr; 1698 tmp = smatis1->tmp; 1699 ctr = smatis1->ctr; 1700 1701 pa = smatis1->pa; 1702 req_size = smatis1->req_size; 1703 req_source1 = smatis1->req_source1; 1704 1705 allcolumns = smatis1->allcolumns; 1706 row2proc = smatis1->row2proc; 1707 rmap = smatis1->rmap; 1708 cmap = smatis1->cmap; 1709 #if defined(PETSC_USE_CTABLE) 1710 rmap_loc = smatis1->rmap_loc; 1711 cmap_loc = smatis1->cmap_loc; 1712 #endif 1713 } 1714 1715 /* Post recv matrix values */ 1716 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1717 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1718 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1719 for (i=0; i<nrqs; ++i) { 1720 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1721 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr); 1722 } 1723 1724 /* Allocate sending buffers for a->a, and send them off */ 1725 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1726 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1727 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1728 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1729 1730 for (i=0; i<nrqr; i++) { 1731 rbuf1_i = rbuf1[i]; 1732 sbuf_aa_i = sbuf_aa[i]; 1733 ct1 = 2*rbuf1_i[0]+1; 1734 ct2 = 0; 1735 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1736 1737 kmax = rbuf1_i[2]; 1738 for (k=0; k<kmax; k++,ct1++) { 1739 row = rbuf1_i[ct1] - rstart; 1740 nzA = ai[row+1] - ai[row]; 1741 nzB = bi[row+1] - 
bi[row]; 1742 ncols = nzA + nzB; 1743 cworkB = bj + bi[row]; 1744 vworkA = a_a + ai[row]; 1745 vworkB = b_a + bi[row]; 1746 1747 /* load the column values for this row into vals*/ 1748 vals = sbuf_aa_i + ct2; 1749 1750 lwrite = 0; 1751 for (l=0; l<nzB; l++) { 1752 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1753 } 1754 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1755 for (l=0; l<nzB; l++) { 1756 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1757 } 1758 1759 ct2 += ncols; 1760 } 1761 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr); 1762 } 1763 1764 /* Assemble submat */ 1765 /* First assemble the local rows */ 1766 for (j=0; j<nrow; j++) { 1767 row = irow[j]; 1768 proc = row2proc[j]; 1769 if (proc == rank) { 1770 Crow = row - rstart; /* local row index of C */ 1771 #if defined(PETSC_USE_CTABLE) 1772 row = rmap_loc[Crow]; /* row index of submat */ 1773 #else 1774 row = rmap[row]; 1775 #endif 1776 1777 if (allcolumns) { 1778 /* diagonal part A = c->A */ 1779 ncols = ai[Crow+1] - ai[Crow]; 1780 cols = aj + ai[Crow]; 1781 vals = a->a + ai[Crow]; 1782 i = 0; 1783 for (k=0; k<ncols; k++) { 1784 subcols[i] = cols[k] + cstart; 1785 subvals[i++] = vals[k]; 1786 } 1787 1788 /* off-diagonal part B = c->B */ 1789 ncols = bi[Crow+1] - bi[Crow]; 1790 cols = bj + bi[Crow]; 1791 vals = b->a + bi[Crow]; 1792 for (k=0; k<ncols; k++) { 1793 subcols[i] = bmap[cols[k]]; 1794 subvals[i++] = vals[k]; 1795 } 1796 1797 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1798 1799 } else { /* !allcolumns */ 1800 #if defined(PETSC_USE_CTABLE) 1801 /* diagonal part A = c->A */ 1802 ncols = ai[Crow+1] - ai[Crow]; 1803 cols = aj + ai[Crow]; 1804 vals = a->a + ai[Crow]; 1805 i = 0; 1806 for (k=0; k<ncols; k++) { 1807 tcol = cmap_loc[cols[k]]; 1808 if (tcol) { 1809 subcols[i] = --tcol; 1810 subvals[i++] = vals[k]; 1811 } 1812 } 1813 1814 /* off-diagonal part B = c->B */ 
1815 ncols = bi[Crow+1] - bi[Crow]; 1816 cols = bj + bi[Crow]; 1817 vals = b->a + bi[Crow]; 1818 for (k=0; k<ncols; k++) { 1819 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1820 if (tcol) { 1821 subcols[i] = --tcol; 1822 subvals[i++] = vals[k]; 1823 } 1824 } 1825 #else 1826 /* diagonal part A = c->A */ 1827 ncols = ai[Crow+1] - ai[Crow]; 1828 cols = aj + ai[Crow]; 1829 vals = a->a + ai[Crow]; 1830 i = 0; 1831 for (k=0; k<ncols; k++) { 1832 tcol = cmap[cols[k]+cstart]; 1833 if (tcol) { 1834 subcols[i] = --tcol; 1835 subvals[i++] = vals[k]; 1836 } 1837 } 1838 1839 /* off-diagonal part B = c->B */ 1840 ncols = bi[Crow+1] - bi[Crow]; 1841 cols = bj + bi[Crow]; 1842 vals = b->a + bi[Crow]; 1843 for (k=0; k<ncols; k++) { 1844 tcol = cmap[bmap[cols[k]]]; 1845 if (tcol) { 1846 subcols[i] = --tcol; 1847 subvals[i++] = vals[k]; 1848 } 1849 } 1850 #endif 1851 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1852 } 1853 } 1854 } 1855 1856 /* Now assemble the off-proc rows */ 1857 for (i=0; i<nrqs; i++) { /* for each requested message */ 1858 /* recv values from other processes */ 1859 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1860 proc = pa[idex]; 1861 sbuf1_i = sbuf1[proc]; 1862 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1863 ct1 = 2 + 1; 1864 ct2 = 0; /* count of received C->j */ 1865 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1866 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1867 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1868 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1869 1870 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1871 max1 = sbuf1_i[2]; /* num of rows */ 1872 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1873 row = sbuf1_i[ct1]; /* row index of submat */ 
1874 if (!allcolumns) { 1875 idex = 0; 1876 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1877 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1878 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1879 #if defined(PETSC_USE_CTABLE) 1880 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1881 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1882 } else { 1883 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1884 } 1885 #else 1886 tcol = cmap[rbuf3_i[ct2]]; 1887 #endif 1888 if (tcol) { 1889 subcols[idex] = --tcol; /* may not be sorted */ 1890 subvals[idex++] = rbuf4_i[ct2]; 1891 1892 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1893 For reuse, we replace received C->j with index that should be inserted to submat */ 1894 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1895 } 1896 } 1897 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1898 } else { /* scall == MAT_REUSE_MATRIX */ 1899 submat = submats[0]; 1900 subc = (Mat_SeqAIJ*)submat->data; 1901 1902 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1903 for (l=0; l<nnz; l++) { 1904 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1905 subvals[idex++] = rbuf4_i[ct2]; 1906 } 1907 1908 bj = subc->j + subc->i[row]; /* sorted column indices */ 1909 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1910 } 1911 } else { /* allcolumns */ 1912 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1913 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1914 ct2 += nnz; 1915 } 1916 } 1917 } 1918 1919 /* sending a->a are done */ 1920 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr); 1921 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1922 1923 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1924 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1925 submats[0] = submat; 1926 1927 /* Restore the indices */ 1928 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1929 if (!allcolumns) { 1930 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1931 } 1932 1933 /* Destroy allocated memory */ 1934 for (i=0; i<nrqs; ++i) { 1935 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1936 } 1937 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1938 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1939 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1940 1941 if (scall == MAT_INITIAL_MATRIX) { 1942 ierr = PetscFree(lens);CHKERRQ(ierr); 1943 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1944 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1945 } 1946 PetscFunctionReturn(0); 1947 } 1948 1949 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1950 { 1951 PetscErrorCode ierr; 1952 PetscInt ncol; 1953 PetscBool colflag,allcolumns=PETSC_FALSE; 1954 1955 PetscFunctionBegin; 1956 /* Allocate memory to hold all the submatrices */ 1957 if (scall == MAT_INITIAL_MATRIX) { 1958 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1959 } 1960 1961 /* Check for special case: each processor gets entire matrix columns */ 1962 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1963 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1964 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1965 1966 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1967 PetscFunctionReturn(0); 1968 } 1969 1970 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1971 { 1972 PetscErrorCode ierr; 1973 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1974 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1975 Mat_SeqAIJ *subc; 1976 Mat_SubSppt *smat; 1977 1978 
PetscFunctionBegin; 1979 /* Check for special case: each processor has a single IS */ 1980 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1981 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1982 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1983 PetscFunctionReturn(0); 1984 } 1985 1986 /* Collect global wantallmatrix and nstages */ 1987 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1988 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1989 if (!nmax) nmax = 1; 1990 1991 if (scall == MAT_INITIAL_MATRIX) { 1992 /* Collect global wantallmatrix and nstages */ 1993 if (ismax == 1 && C->rmap->N == C->cmap->N) { 1994 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 1995 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 1996 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 1997 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 1998 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 1999 wantallmatrix = PETSC_TRUE; 2000 2001 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2002 } 2003 } 2004 2005 /* Determine the number of stages through which submatrices are done 2006 Each stage will extract nmax submatrices. 2007 nmax is determined by the matrix column dimension. 2008 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2009 */ 2010 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 2011 2012 in[0] = -1*(PetscInt)wantallmatrix; 2013 in[1] = nstages; 2014 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2015 wantallmatrix = (PetscBool)(-out[0]); 2016 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2017 2018 } else { /* MAT_REUSE_MATRIX */ 2019 if (ismax) { 2020 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2021 smat = subc->submatis1; 2022 } else { /* (*submat)[0] is a dummy matrix */ 2023 smat = (Mat_SubSppt*)(*submat)[0]->data; 2024 } 2025 if (!smat) { 2026 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2027 wantallmatrix = PETSC_TRUE; 2028 } else if (smat->singleis) { 2029 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2030 PetscFunctionReturn(0); 2031 } else { 2032 nstages = smat->nstages; 2033 } 2034 } 2035 2036 if (wantallmatrix) { 2037 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2038 PetscFunctionReturn(0); 2039 } 2040 2041 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2042 if (scall == MAT_INITIAL_MATRIX) { 2043 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2044 } 2045 2046 for (i=0,pos=0; i<nstages; i++) { 2047 if (pos+nmax <= ismax) max_no = nmax; 2048 else if (pos >= ismax) max_no = 0; 2049 else max_no = ismax-pos; 2050 2051 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2052 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2053 smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++; 2054 smat->nstages = nstages; 2055 } 2056 pos += max_no; 2057 } 2058 2059 if (ismax && scall == MAT_INITIAL_MATRIX) { 2060 /* save nstages for reuse */ 2061 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2062 smat = subc->submatis1; 2063 smat->nstages = nstages; 2064 } 2065 PetscFunctionReturn(0); 2066 } 2067 2068 
/* -------------------------------------------------------------------------*/
/*
   MatCreateSubMatrices_MPIAIJ_Local - worker that extracts ismax sequential
   submatrices of C (one per (isrow[i],iscol[i]) pair) on this process.

   Protocol (MAT_INITIAL_MATRIX):
     1. map requested rows to owning ranks, size messages (w1..w4);
     2. send requested off-process row indices (tag0), receive per-row nonzero
        counts back (tag2), then column indices (tag3) and values (tag4);
     3. build global-col -> local-col (cmap) and global-row -> local-row (rmap)
        maps, count row lengths, preallocate each submatrix;
     4. fill local rows directly from A (diagonal) and B (off-diagonal) parts,
        then fill off-process rows from the received buffers;
     5. stash all communication metadata in a Mat_SubSppt attached to each
        submatrix so MAT_REUSE_MATRIX can skip steps 1-3.

   On MAT_REUSE_MATRIX the saved Mat_SubSppt is unpacked instead, only values
   (tag4) are re-communicated, and ilen is reset so rows are refilled in place.

   The statement order below is load-bearing: sends/recvs are paired by tag and
   rbuf2 aliases into the tail of tmp, so do not reorder.
*/
PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
  PetscInt       nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
  PetscInt       **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar    **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
  PetscMPIInt    *onodes1,*olengths1,end;
  PetscInt       **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
  Mat_SubSppt    *smat_i;
  PetscBool      *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
  PetscInt       *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;

  ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr);
  ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    /* iscsorted stays PETSC_TRUE only if every iscol[i] is sorted; an unsorted
       one forces a per-row sort pass at the end */
    ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
    if (!issorted[i]) iscsorted = issorted[i];

    ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);

    ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);

    /* Check for special case: allcolumn */
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
    if (colflag && ncol[i] == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
      icol[i]       = NULL;
    } else {
      allcolumns[i] = PETSC_FALSE;
      ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
    }
  }

  if (scall == MAT_REUSE_MATRIX) {
    /* Assumes new rows are same length as the old rows */
    for (i=0; i<ismax; i++) {
      if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");

      /* Initial matrix as if empty */
      ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr);

      /* Unpack the communication metadata saved at MAT_INITIAL_MATRIX time */
      smat_i = subc->submatis1;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[i] = smat_i->allcolumns;
      row2proc[i]   = smat_i->row2proc;
      rmap[i]       = smat_i->rmap;
      cmap[i]       = smat_i->cmap;
    }

    if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
      if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
      smat_i = (Mat_SubSppt*)submats[0]->data;

      nrqs        = smat_i->nrqs;
      nrqr        = smat_i->nrqr;
      rbuf1       = smat_i->rbuf1;
      rbuf2       = smat_i->rbuf2;
      rbuf3       = smat_i->rbuf3;
      req_source2 = smat_i->req_source2;

      sbuf1 = smat_i->sbuf1;
      sbuf2 = smat_i->sbuf2;
      ptr   = smat_i->ptr;
      tmp   = smat_i->tmp;
      ctr   = smat_i->ctr;

      pa          = smat_i->pa;
      req_size    = smat_i->req_size;
      req_source1 = smat_i->req_source1;

      allcolumns[0] = PETSC_FALSE;
    }
  } else { /* scall == MAT_INITIAL_MATRIX */
    /* Get some new tags to keep the communication clean */
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
    ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);

    /* evaluate communication - mesg to who, length of mesg, and buffer space
       required. Based on this, buffers are allocated, and data copied into them*/
    ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */

    for (i=0; i<ismax; i++) {
      jmax   = nrow[i];
      irow_i = irow[i];

      ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
      row2proc[i] = row2proc_i;

      /* for a sorted isrow the owner search can resume from the previous row's
         owner; otherwise restart the scan at rank 0 for every row */
      if (issorted[i]) proc = 0;
      for (j=0; j<jmax; j++) {
        if (!issorted[i]) proc = 0;
        row = irow_i[j];
        while (row >= C->rmap->range[proc+1]) proc++;
        w4[proc]++;
        row2proc_i[j] = proc; /* map row index to proc */
      }
      for (j=0; j<size; j++) {
        if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
      }
    }

    nrqs     = 0;              /* no of outgoing messages */
    msz      = 0;              /* total mesg length (for all procs) */
    w1[rank] = 0;              /* no mesg sent to self */
    w3[rank] = 0;
    for (i=0; i<size; i++) {
      if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
    }
    ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/
    for (i=0,j=0; i<size; i++) {
      if (w1[i]) { pa[j] = i; j++; }
    }

    /* Each message would have a header = 1 + 2*(no of IS) + data */
    for (i=0; i<nrqs; i++) {
      j       = pa[i];
      w1[j]  += w2[j] + 2* w3[j];
      msz    += w1[j];
    }
    ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
    ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

    /* Now post the Irecvs corresponding to these messages */
    tag0 = ((PetscObject)C)->tag;
    ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);

    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Allocate Memory for outgoing messages */
    ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
    ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
    ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);

    {
      /* carve the first msz entries of tmp into per-destination send buffers */
      PetscInt *iptr = tmp;
      k = 0;
      for (i=0; i<nrqs; i++) {
        j        = pa[i];
        iptr    += k;
        sbuf1[j] = iptr;
        k        = w1[j];
      }
    }

    /* Form the outgoing messages. Initialize the header space */
    for (i=0; i<nrqs; i++) {
      j           = pa[i];
      sbuf1[j][0] = 0;
      ierr        = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
      ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
    }

    /* Parse the isrow and copy data into outbuf */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      ierr   = PetscArrayzero(ctr,size);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) { /* parse the indices of each IS */
        proc = row2proc_i[j];
        if (proc != rank) { /* copy to the outgoing buf*/
          ctr[proc]++;
          *ptr[proc] = irow_i[j];
          ptr[proc]++;
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop too */
        if ((ctr_j = ctr[j])) {
          sbuf1_j        = sbuf1[j];
          k              = ++sbuf1_j[0];
          sbuf1_j[2*k]   = ctr_j;  /* number of rows requested for IS i */
          sbuf1_j[2*k-1] = i;      /* which IS they belong to */
        }
      }
    }

    /* Now post the sends */
    ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
    }

    /* Post Receives to capture the buffer size */
    ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
    /* rbuf2 aliases the unused tail of tmp (offset msz) rather than a fresh
       allocation -- tmp was sized 2*msz above for exactly this purpose */
    rbuf2[0] = tmp + msz;
    for (i=1; i<nrqs; ++i) {
      rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
    }
    for (i=0; i<nrqs; ++i) {
      j    = pa[i];
      ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
    }

    /* Send to other procs the buf size they should allocate */
    /* Receive messages*/
    ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
    ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
    {
      PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
      PetscInt *sbuf2_i;

      ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
      for (i=0; i<nrqr; ++i) {
        req_size[i] = 0;
        rbuf1_i     = rbuf1[i];
        start       = 2*rbuf1_i[0] + 1; /* skip the header: count + 2 ints per IS */
        ierr        = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
        ierr        = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
        sbuf2_i     = sbuf2[i];
        /* reply with the nonzero count (A part + B part) of each requested row */
        for (j=start; j<end; j++) {
          id           = rbuf1_i[j] - rstart;
          ncols        = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
          sbuf2_i[j]   = ncols;
          req_size[i] += ncols;
        }
        req_source1[i] = r_status1[i].MPI_SOURCE;
        /* form the header */
        sbuf2_i[0] = req_size[i];
        for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];

        ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscFree(r_status1);CHKERRQ(ierr);
    ierr = PetscFree(r_waits1);CHKERRQ(ierr);
    ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

    /* Receive messages*/
    ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);

    ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      /* rbuf2[i][0] holds the total nonzero count announced by the owner */
      ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
      req_source2[i] = r_status2[i].MPI_SOURCE;
      ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
    }
    ierr = PetscFree(r_status2);CHKERRQ(ierr);
    ierr = PetscFree(r_waits2);CHKERRQ(ierr);

    /* Wait on sends1 and sends2 */
    ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
    ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);

    if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
    ierr = PetscFree(s_status1);CHKERRQ(ierr);
    ierr = PetscFree(s_status2);CHKERRQ(ierr);
    ierr = PetscFree(s_waits1);CHKERRQ(ierr);
    ierr = PetscFree(s_waits2);CHKERRQ(ierr);

    /* Now allocate sending buffers for a->j, and send them off */
    ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
    for (i=0,j=0; i<nrqr; i++) j += req_size[i];
    ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
    for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

    ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
    {
      PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
      PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
      PetscInt cend = C->cmap->rend;
      PetscInt *a_j = a->j,*b_j = b->j,ctmp;

      for (i=0; i<nrqr; i++) {
        rbuf1_i   = rbuf1[i];
        sbuf_aj_i = sbuf_aj[i];
        ct1       = 2*rbuf1_i[0] + 1;
        ct2       = 0;
        for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
          kmax = rbuf1[i][2*j];
          for (k=0; k<kmax; k++,ct1++) {
            row    = rbuf1_i[ct1] - rstart;
            nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
            ncols  = nzA + nzB;
            cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

            /* load the column indices for this row into cols, in global
               ascending order: B columns left of the diagonal block, then the
               A (diagonal) columns shifted by cstart, then B columns to the
               right of the diagonal block */
            cols = sbuf_aj_i + ct2;

            lwrite = 0;
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
            }
            for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
            for (l=0; l<nzB; l++) {
              if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
            }

            ct2 += ncols;
          }
        }
        ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
      }
    }
    ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);

    /* create col map: global col of C -> local col of submatrices */
    {
      const PetscInt *icol_i;
#if defined(PETSC_USE_CTABLE)
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr);

          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            /* keys/values are stored +1 so that 0 can mean "absent" */
            ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
          }
        } else cmap[i] = NULL;
      }
#else
      for (i=0; i<ismax; i++) {
        if (!allcolumns[i]) {
          ierr   = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr);
          jmax   = ncol[i];
          icol_i = icol[i];
          cmap_i = cmap[i];
          for (j=0; j<jmax; j++) {
            /* value is local index +1; 0 means the column is not selected */
            cmap_i[icol_i[j]] = j+1;
          }
        } else cmap[i] = NULL;
      }
#endif
    }

    /* Create lens which is required for MatCreate... */
    for (i=0,j=0; i<ismax; i++) j += nrow[i];
    ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);

    if (ismax) {
      ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
    }
    for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];

    /* Update lens from local data */
    for (i=0; i<ismax; i++) {
      row2proc_i = row2proc[i];
      jmax = nrow[i];
      if (!allcolumns[i]) cmap_i = cmap[i];
      irow_i = irow[i];
      lens_i = lens[i];
      for (j=0; j<jmax; j++) {
        row  = irow_i[j];
        proc = row2proc_i[j];
        if (proc == rank) {
          ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
          if (!allcolumns[i]) {
            for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
              ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
              tcol = cmap_i[cols[k]];
#endif
              if (tcol) lens_i[j]++;
            }
          } else { /* allcolumns */
            lens_i[j] = ncols;
          }
          ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr);
        }
      }
    }

    /* Create row map: global row of C -> local row of submatrices */
#if defined(PETSC_USE_CTABLE)
    for (i=0; i<ismax; i++) {
      ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr);
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
      }
    }
#else
    for (i=0; i<ismax; i++) {
      ierr   = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr);
      rmap_i = rmap[i];
      irow_i = irow[i];
      jmax   = nrow[i];
      for (j=0; j<jmax; j++) {
        rmap_i[irow_i[j]] = j;
      }
    }
#endif

    /* Update lens from offproc data */
    {
      PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;

      ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
      for (tmp2=0; tmp2<nrqs; tmp2++) {
        sbuf1_i = sbuf1[pa[tmp2]];
        jmax    = sbuf1_i[0];
        ct1     = 2*jmax+1;
        ct2     = 0;
        rbuf2_i = rbuf2[tmp2];
        rbuf3_i = rbuf3[tmp2];
        for (j=1; j<=jmax; j++) {
          is_no  = sbuf1_i[2*j-1];
          max1   = sbuf1_i[2*j];
          lens_i = lens[is_no];
          if (!allcolumns[is_no]) cmap_i = cmap[is_no];
          rmap_i = rmap[is_no];
          for (k=0; k<max1; k++,ct1++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
            row--;
            if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
#else
            row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
#endif
            max2 = rbuf2_i[ct1];
            for (l=0; l<max2; l++,ct2++) {
              if (!allcolumns[is_no]) {
#if defined(PETSC_USE_CTABLE)
                ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
                tcol = cmap_i[rbuf3_i[ct2]];
#endif
                if (tcol) lens_i[row]++;
              } else { /* allcolumns */
                lens_i[row]++; /* lens_i[row] += max2 ? */
              }
            }
          }
        }
      }
    }
    ierr = PetscFree(r_waits3);CHKERRQ(ierr);
    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
    ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
    ierr = PetscFree(s_waits3);CHKERRQ(ierr);

    /* Create the submatrices */
    for (i=0; i<ismax; i++) {
      PetscInt rbs,cbs;

      ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr);
      ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr);

      ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);

      ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr);
      ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNew(&smat_i);CHKERRQ(ierr);
      subc = (Mat_SeqAIJ*)submats[i]->data;
      subc->submatis1 = smat_i;

      /* hook a destroy wrapper so the shared buffers saved below are released
         when the last submatrix goes away */
      smat_i->destroy          = submats[i]->ops->destroy;
      submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
      submats[i]->factortype   = C->factortype;

      smat_i->id          = i;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = allcolumns[i];
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = row2proc[i];
      smat_i->rmap       = rmap[i];
      smat_i->cmap       = cmap[i];
    }

    if (!ismax) { /* Create dummy submats[0] for reuse struct subc */
      ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
      ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);

      /* create struct Mat_SubSppt and attached it to submat */
      ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
      submats[0]->data = (void*)smat_i;

      smat_i->destroy          = submats[0]->ops->destroy;
      submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
      submats[0]->factortype   = C->factortype;

      smat_i->id          = 0;
      smat_i->nrqs        = nrqs;
      smat_i->nrqr        = nrqr;
      smat_i->rbuf1       = rbuf1;
      smat_i->rbuf2       = rbuf2;
      smat_i->rbuf3       = rbuf3;
      smat_i->sbuf2       = sbuf2;
      smat_i->req_source2 = req_source2;

      smat_i->sbuf1 = sbuf1;
      smat_i->ptr   = ptr;
      smat_i->tmp   = tmp;
      smat_i->ctr   = ctr;

      smat_i->pa          = pa;
      smat_i->req_size    = req_size;
      smat_i->req_source1 = req_source1;

      smat_i->allcolumns = PETSC_FALSE;
      smat_i->singleis   = PETSC_FALSE;
      smat_i->row2proc   = NULL;
      smat_i->rmap       = NULL;
      smat_i->cmap       = NULL;
    }

    if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
    ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);

  } /* endof scall == MAT_INITIAL_MATRIX */

  /* Post recv matrix values */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
  }

  /* Allocate sending buffers for a->a, and send them off */
  ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
  for (i=0,j=0; i<nrqr; i++) j += req_size[i];
  ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
  {
    PetscInt    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
    PetscInt    cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
    PetscInt    cend = C->cmap->rend;
    PetscInt    *b_j = b->j;
    PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;

    for (i=0; i<nrqr; i++) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
        kmax = rbuf1_i[2*j];
        for (k=0; k<kmax; k++,ct1++) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals, in the same
             B-left / A / B-right order used for the column indices above */
          vals = sbuf_aa_i+ct2;

          lwrite = 0;
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
          }
          for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
          for (l=0; l<nzB; l++) {
            if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
          }

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  for (i=0; i<ismax; i++) {
    row2proc_i = row2proc[i];
    subc      = (Mat_SeqAIJ*)submats[i]->data;
    imat_ilen = subc->ilen;
    imat_j    = subc->j;
    imat_i    = subc->i;
    imat_a    = subc->a;

    if (!allcolumns[i]) cmap_i = cmap[i];
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for (j=0; j<jmax; j++) {
      row  = irow_i[j];
      proc = row2proc_i[j];
      if (proc == rank) {
        old_row = row;
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen_row = imat_ilen[row];
        ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
        mat_i    = imat_i[row];
        mat_a    = imat_a + mat_i;
        mat_j    = imat_j + mat_i;
        if (!allcolumns[i]) {
          for (k=0; k<ncols; k++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[cols[k]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1; /* undo the +1 offset used in cmap */
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
        } else { /* allcolumns */
          for (k=0; k<ncols; k++) {
            *mat_j++ = cols[k]; /* global col index! */
            *mat_a++ = vals[k];
            ilen_row++;
          }
        }
        ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);

        imat_ilen[row] = ilen_row;
      }
    }
  }

  /* Now assemble the off proc rows */
  ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
  for (tmp2=0; tmp2<nrqs; tmp2++) {
    sbuf1_i = sbuf1[pa[tmp2]];
    jmax    = sbuf1_i[0];
    ct1     = 2*jmax + 1;
    ct2     = 0;
    rbuf2_i = rbuf2[tmp2];
    rbuf3_i = rbuf3[tmp2];
    rbuf4_i = rbuf4[tmp2];
    for (j=1; j<=jmax; j++) {
      is_no  = sbuf1_i[2*j-1];
      rmap_i = rmap[is_no];
      if (!allcolumns[is_no]) cmap_i = cmap[is_no];
      subc      = (Mat_SeqAIJ*)submats[is_no]->data;
      imat_ilen = subc->ilen;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      max1      = sbuf1_i[2*j];
      for (k=0; k<max1; k++,ct1++) {
        row = sbuf1_i[ct1];
#if defined(PETSC_USE_CTABLE)
        ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
        row--;
#else
        row = rmap_i[row];
#endif
        ilen  = imat_ilen[row];
        mat_i = imat_i[row];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        max2  = rbuf2_i[ct1];
        if (!allcolumns[is_no]) {
          for (l=0; l<max2; l++,ct2++) {
#if defined(PETSC_USE_CTABLE)
            ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
#else
            tcol = cmap_i[rbuf3_i[ct2]];
#endif
            if (tcol) {
              *mat_j++ = tcol - 1;
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
        } else { /* allcolumns */
          for (l=0; l<max2; l++,ct2++) {
            *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
            *mat_a++ = rbuf4_i[ct2];
            ilen++;
          }
        }
        imat_ilen[row] = ilen;
      }
    }
  }

  if (!iscsorted) { /* sort column indices of the rows */
    for (i=0; i<ismax; i++) {
      subc      = (Mat_SeqAIJ*)submats[i]->data;
      imat_j    = subc->j;
      imat_i    = subc->i;
      imat_a    = subc->a;
      imat_ilen = subc->ilen;

      if (allcolumns[i]) continue;
      jmax = nrow[i];
      for (j=0; j<jmax; j++) {
        mat_i = imat_i[j];
        mat_a = imat_a + mat_i;
        mat_j = imat_j + mat_i;
        ierr  = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr);
      }
    }
  }

  ierr = PetscFree(r_status4);CHKERRQ(ierr);
  ierr = PetscFree(r_waits4);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
  ierr = PetscFree(s_waits4);CHKERRQ(ierr);
  ierr = PetscFree(s_status4);CHKERRQ(ierr);

  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
    if (!allcolumns[i]) {
      ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
    }
  }

  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
  ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);

  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);

  ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset
 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n).
 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
 After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
 state, and needs to be "assembled" later by compressing B's column space.
2886 2887 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2888 Following this call, C->A & C->B have been created, even if empty. 2889 */ 2890 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2891 { 2892 /* If making this function public, change the error returned in this function away from _PLIB. */ 2893 PetscErrorCode ierr; 2894 Mat_MPIAIJ *aij; 2895 Mat_SeqAIJ *Baij; 2896 PetscBool seqaij,Bdisassembled; 2897 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2898 PetscScalar v; 2899 const PetscInt *rowindices,*colindices; 2900 2901 PetscFunctionBegin; 2902 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2903 if (A) { 2904 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2905 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2906 if (rowemb) { 2907 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2908 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2909 } else { 2910 if (C->rmap->n != A->rmap->n) { 2911 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2912 } 2913 } 2914 if (dcolemb) { 2915 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2916 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2917 } else { 2918 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2919 } 2920 } 2921 if (B) { 2922 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2923 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2924 if (rowemb) { 2925 ierr = 
ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2926 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2927 } else { 2928 if (C->rmap->n != B->rmap->n) { 2929 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2930 } 2931 } 2932 if (ocolemb) { 2933 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2934 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2935 } else { 2936 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2937 } 2938 } 2939 2940 aij = (Mat_MPIAIJ*)C->data; 2941 if (!aij->A) { 2942 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2943 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2944 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2945 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2946 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2947 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2948 } 2949 if (A) { 2950 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2951 } else { 2952 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2953 } 2954 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2955 /* 2956 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2957 need to "disassemble" B -- convert it to using C's global indices. 2958 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2959 2960 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2961 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
2962 2963 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2964 At least avoid calling MatSetValues() and the implied searches? 2965 */ 2966 2967 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2968 #if defined(PETSC_USE_CTABLE) 2969 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2970 #else 2971 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2972 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2973 if (aij->B) { 2974 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2975 } 2976 #endif 2977 ngcol = 0; 2978 if (aij->lvec) { 2979 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2980 } 2981 if (aij->garray) { 2982 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2983 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2984 } 2985 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2986 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 2987 } 2988 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 2989 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 2990 } 2991 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 2992 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 2993 } 2994 } 2995 Bdisassembled = PETSC_FALSE; 2996 if (!aij->B) { 2997 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 2998 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 2999 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3000 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3001 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3002 Bdisassembled = PETSC_TRUE; 3003 } 3004 if (B) { 3005 Baij = (Mat_SeqAIJ*)B->data; 3006 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3007 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3008 for (i=0; i<B->rmap->n; i++) { 3009 nz[i] = Baij->i[i+1] - Baij->i[i]; 3010 } 3011 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 
3012 ierr = PetscFree(nz);CHKERRQ(ierr); 3013 } 3014 3015 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3016 shift = rend-rstart; 3017 count = 0; 3018 rowindices = NULL; 3019 colindices = NULL; 3020 if (rowemb) { 3021 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3022 } 3023 if (ocolemb) { 3024 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3025 } 3026 for (i=0; i<B->rmap->n; i++) { 3027 PetscInt row; 3028 row = i; 3029 if (rowindices) row = rowindices[i]; 3030 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3031 col = Baij->j[count]; 3032 if (colindices) col = colindices[col]; 3033 if (Bdisassembled && col>=rstart) col += shift; 3034 v = Baij->a[count]; 3035 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3036 ++count; 3037 } 3038 } 3039 /* No assembly for aij->B is necessary. */ 3040 /* FIXME: set aij->B's nonzerostate correctly. */ 3041 } else { 3042 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3043 } 3044 C->preallocated = PETSC_TRUE; 3045 C->was_assembled = PETSC_FALSE; 3046 C->assembled = PETSC_FALSE; 3047 /* 3048 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3049 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3050 */ 3051 PetscFunctionReturn(0); 3052 } 3053 3054 /* 3055 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3056 */ 3057 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3058 { 3059 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3060 3061 PetscFunctionBegin; 3062 PetscValidPointer(A,2); 3063 PetscValidPointer(B,3); 3064 /* FIXME: make sure C is assembled */ 3065 *A = aij->A; 3066 *B = aij->B; 3067 /* Note that we don't incref *A and *B, so be careful! */ 3068 PetscFunctionReturn(0); 3069 } 3070 3071 /* 3072 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 
  NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
*/
PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                               PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                               PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                               PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                               PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  /* NOTE(review): the getlocalmats callback is not referenced anywhere in this body -- confirm
     whether it is reserved for the unimplemented MAT_REUSE_MATRIX path or can be dropped. */
  PetscErrorCode ierr;
  PetscMPIInt    size,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* cismax counts this rank's "parallel" index sets (those living on a communicator of size > 1). */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++cismax;
  }

  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar counts the number of parallel ISs across C's comm.
  */
  ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
     Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) { /* ii indexes the parallel-only arrays (cisrow/ciscol/ciscol_p) */
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRQ(ierr);
    if (size > 1) {
      /*
        TODO: This is the part that's ***NOT SCALABLE***.
        To fix this we need to extract just the indices of C's nonzero columns
        that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
        part of iscol[i] -- without actually computing ciscol[ii]. This also has
        to be done without serializing on the IS list, so, most likely, it is best
        done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) { /* sorted, so duplicates must be adjacent */
      if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);
    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRQ(ierr);
    if (size > 1) {
      cisrow[ii] = isrow[i]; /* alias, not increfed -- see the comment above */
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
  }

  /* Now obtain the sequential A and B submatrices separately. */
  /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if size == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern = DIFFERENT_NONZERO_PATTERN;

    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (size > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA = A[i],BB = B[ii];

        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        /* BB (i.e., B[ii]) is not destroyed here; the whole B array is released below via MatDestroySubMatrices(). */
        ierr = MatDestroy(&AA);CHKERRQ(ierr);
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (size == 1) */
      if (scall == MAT_REUSE_MATRIX) {
        ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr);
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      } else (*submat)[i] = A[i]; /* ownership of A[i] transfers to the caller's submat array */
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  ierr = PetscFree(A);CHKERRQ(ierr); /* frees only the array; the A[i] were destroyed or handed off above */
  ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MPIAIJ front end: MatCreateSubMatricesMPI_MPIXAIJ() specialized with the SeqAIJ/MPIAIJ callbacks. */
PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}