1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++) { 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++) { 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank) { 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows) { 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++) { 115 if (tosizes_temp[i]) { 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 toffsets[0] = 0; 123 for (i=0; i<nto; i++) { 124 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 125 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 126 } 127 /* send information to other processors */ 128 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 129 nrecvrows = 0; 130 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 131 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 132 nrecvrows = 0; 133 for (i=0; i<nfrom; i++) { 134 for (j=0; j<fromsizes[2*i]; j++) { 135 remote[nrecvrows].rank = fromranks[i]; 136 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 137 } 138 } 139 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 140 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 141 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 142 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 143 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 144 /* message pair <no of is, row> */ 145 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 146 for (i=0; i<nrrows; i++) { 147 owner = rrow_ranks[i]; /* processor */ 148 j = tosizes_temp[owner]; /* index */ 149 todata[toffsets[j]++] = rrow_isids[i]; 150 todata[toffsets[j]++] = remoterows[i]; 151 } 152 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 153 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 154 ierr = PetscFree(toffsets);CHKERRQ(ierr); 155 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr); 156 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);CHKERRQ(ierr); 157 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 158 /* send rows belonging to the remote so that then we could get the overlapping data back */ 159 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 160 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 161 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 162 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 163 ierr = PetscFree(fromranks);CHKERRQ(ierr); 164 nrecvrows = 0; 165 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 166 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 167 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 168 nrecvrows = 0; 169 for (i=0; i<nto; i++) { 170 for (j=0; j<tosizes[2*i]; j++) { 171 remote[nrecvrows].rank = toranks[i]; 172 remote[nrecvrows++].index = tosizes[2*i+1]+j; 173 } 174 } 175 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 176 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 177 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 178 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 179 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 180 /* overlap communication and computation */ 181 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr); 182 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 183 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);CHKERRQ(ierr); 184 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 185 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 186 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 187 ierr = PetscFree(toranks);CHKERRQ(ierr); 188 ierr = PetscFree(tosizes);CHKERRQ(ierr); 189 ierr = PetscFree(todata);CHKERRQ(ierr); 190 PetscFunctionReturn(0); 191 } 192 193 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 194 { 195 PetscInt *isz,isz_i,i,j,is_id, data_size; 196 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 197 const PetscInt *indices_i_temp; 198 MPI_Comm *iscomms; 199 PetscErrorCode ierr; 200 201 PetscFunctionBegin; 202 max_lsize = 0; 203 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 204 for (i=0; i<nidx; i++){ 205 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 206 max_lsize = lsize>max_lsize ? lsize:max_lsize; 207 isz[i] = lsize; 208 } 209 ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr); 210 for (i=0; i<nidx; i++){ 211 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 212 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 213 ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr); 214 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 215 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 216 } 217 /* retrieve information to get row id and its overlap */ 218 for (i=0; i<nrecvs;){ 219 is_id = recvdata[i++]; 220 data_size = recvdata[i++]; 221 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 222 isz_i = isz[is_id]; 223 for (j=0; j< data_size; j++){ 224 col = recvdata[i++]; 225 indices_i[isz_i++] = col; 226 } 227 isz[is_id] = isz_i; 228 } 229 /* remove duplicate entities */ 230 for (i=0; i<nidx; i++){ 231 indices_i = indices_temp+(max_lsize+nrecvs)*i; 232 isz_i = isz[i]; 233 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 234 ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 235 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 236 } 237 ierr = PetscFree(isz);CHKERRQ(ierr); 238 ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 243 { 244 PetscLayout rmap,cmap; 245 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 246 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 247 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 248 const PetscInt *gcols,*ai,*aj,*bi,*bj; 249 Mat amat,bmat; 250 PetscMPIInt rank; 251 PetscBool done; 252 MPI_Comm comm; 253 PetscErrorCode ierr; 254 255 PetscFunctionBegin; 256 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 257 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 258 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 259 /* Even if the mat is symmetric, we still assume it is not symmetric */ 260 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 261 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 262 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 263 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 264 /* total number of nonzero values is used to estimate the memory usage in the next step */ 265 tnz = ai[an]+bi[bn]; 266 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 267 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 268 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 269 /* to find the longest message */ 270 max_fszs = 0; 271 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 272 /* better way to estimate number of nonzero in the mat??? */ 273 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 274 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 275 rows_pos = 0; 276 totalrows = 0; 277 for (i=0; i<nfrom; i++){ 278 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 279 /* group data together */ 280 for (j=0; j<fromsizes[2*i]; j+=2){ 281 is_id = fromrows[rows_pos++];/* no of is */ 282 rows_i = rows_data[is_id]; 283 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 284 } 285 /* estimate a space to avoid multiple allocations */ 286 for (j=0; j<nidx; j++){ 287 indvc_ij = 0; 288 rows_i = rows_data[j]; 289 for (l=0; l<rows_pos_i[j]; l++){ 290 row = rows_i[l]-rstart; 291 start = ai[row]; 292 end = ai[row+1]; 293 for (k=start; k<end; k++){ /* Amat */ 294 col = aj[k] + cstart; 295 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 296 } 297 start = bi[row]; 298 end = bi[row+1]; 299 for (k=start; k<end; k++) { /* Bmat */ 300 col = gcols[bj[k]]; 301 indices_tmp[indvc_ij++] = col; 302 } 303 } 304 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 305 indv_counts[i*nidx+j] = indvc_ij; 306 totalrows += indvc_ij; 307 } 308 } 309 /* message triple <no of is, number of rows, rows> */ 310 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 311 totalrows = 0; 312 rows_pos = 0; 313 /* use this code again */ 314 for (i=0;i<nfrom;i++){ 315 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 316 for (j=0; j<fromsizes[2*i]; j+=2){ 317 is_id = fromrows[rows_pos++]; 318 rows_i = rows_data[is_id]; 319 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 320 } 321 /* add data */ 322 for (j=0; j<nidx; j++){ 323 if (!indv_counts[i*nidx+j]) continue; 324 indvc_ij = 0; 325 sbdata[totalrows++] = j; 326 sbdata[totalrows++] = indv_counts[i*nidx+j]; 327 sbsizes[2*i] += 2; 328 rows_i = rows_data[j]; 329 for (l=0; l<rows_pos_i[j]; l++){ 330 row = rows_i[l]-rstart; 331 start = ai[row]; 332 end = ai[row+1]; 333 for (k=start; k<end; k++){ /* Amat */ 334 col = aj[k] + cstart; 335 indices_tmp[indvc_ij++] = col; 336 } 337 start = bi[row]; 338 end = bi[row+1]; 339 for (k=start; k<end; k++) { /* Bmat */ 340 col = gcols[bj[k]]; 341 indices_tmp[indvc_ij++] = col; 342 } 343 } 344 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 345 sbsizes[2*i] += indvc_ij; 346 ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr); 347 totalrows += indvc_ij; 348 } 349 } 350 ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 351 offsets[0] = 0; 352 for (i=0; i<nfrom; i++){ 353 offsets[i+1] = offsets[i] + sbsizes[2*i]; 354 sbsizes[2*i+1] = offsets[i]; 355 } 356 ierr = PetscFree(offsets);CHKERRQ(ierr); 357 if (sbrowsizes) *sbrowsizes = sbsizes; 358 if (sbrows) *sbrows = sbdata; 359 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 360 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 361 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 366 { 367 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 368 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 369 PetscInt lsize,lsize_tmp; 370 PetscMPIInt rank,owner; 371 Mat amat,bmat; 372 PetscBool done; 373 PetscLayout cmap,rmap; 374 MPI_Comm comm; 375 PetscErrorCode ierr; 376 377 PetscFunctionBegin; 378 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 379 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 380 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 381 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 382 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 383 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 384 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 385 /* is it a safe way to compute number of nonzero values ? */ 386 tnz = ai[an]+bi[bn]; 387 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 388 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 389 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 390 /* it is a better way to estimate memory than the old implementation 391 * where global size of matrix is used 392 * */ 393 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 394 for (i=0; i<nidx; i++) { 395 MPI_Comm iscomm; 396 397 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 398 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 399 lsize_tmp = 0; 400 for (j=0; j<lsize; j++) { 401 owner = -1; 402 row = indices[j]; 403 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 404 if (owner != rank) continue; 405 /* local number */ 406 row -= rstart; 407 start = ai[row]; 408 end = ai[row+1]; 409 for (k=start; k<end; k++) { /* Amat */ 410 col = aj[k] + cstart; 411 indices_temp[lsize_tmp++] = col; 412 } 413 start = bi[row]; 414 end = bi[row+1]; 415 for (k=start; k<end; k++) { /* Bmat */ 416 col = gcols[bj[k]]; 417 indices_temp[lsize_tmp++] = col; 418 } 419 } 420 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 421 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr); 422 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 423 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 424 ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 425 ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr); 426 } 427 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 428 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 429 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 430 PetscFunctionReturn(0); 431 } 432 433 434 /* 435 Sample message format: 436 If a processor A wants processor B to process some elements corresponding 437 to index sets is[1],is[5] 438 mesg [0] = 2 (no of index sets in the mesg) 439 ----------- 440 mesg [1] = 1 => is[1] 441 mesg [2] = sizeof(is[1]); 442 ----------- 443 mesg [3] = 5 => is[5] 444 mesg [4] = sizeof(is[5]); 445 ----------- 446 mesg [5] 447 mesg [n] datas[1] 448 ----------- 449 mesg[n+1] 450 mesg[m] data(is[5]) 451 ----------- 452 453 Notes: 454 nrqs - no of requests sent (or to be sent out) 455 nrqr - no of requests received (which have to be or which have been processed) 456 */ 457 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 458 { 459 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 460 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 461 const PetscInt **idx,*idx_i; 462 PetscInt *n,**data,len; 463 #if defined(PETSC_USE_CTABLE) 464 PetscTable *table_data,table_data_i; 465 PetscInt *tdata,tcount,tcount_max; 466 #else 467 PetscInt *data_i,*d_p; 468 #endif 469 PetscErrorCode ierr; 470 PetscMPIInt size,rank,tag1,tag2,proc = 0; 471 PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 472 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 473 PetscBT *table; 474 MPI_Comm comm; 475 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 476 MPI_Status *s_status,*recv_status; 477 MPI_Comm *iscomms; 478 char *t_p; 479 480 PetscFunctionBegin; 481 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 482 size = c->size; 483 rank = c->rank; 484 M = C->rmap->N; 485 486 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 487 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 488 489 ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 490 491 for (i=0; i<imax; i++) { 492 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 493 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 494 } 495 496 /* evaluate communication - mesg to who,length of mesg, and buffer space 497 required. Based on this, buffers are allocated, and data copied into them */ 498 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 499 for (i=0; i<imax; i++) { 500 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/ 501 idx_i = idx[i]; 502 len = n[i]; 503 for (j=0; j<len; j++) { 504 row = idx_i[j]; 505 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 506 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 507 w4[proc]++; 508 } 509 for (j=0; j<size; j++) { 510 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 511 } 512 } 513 514 nrqs = 0; /* no of outgoing messages */ 515 msz = 0; /* total mesg length (for all proc */ 516 w1[rank] = 0; /* no mesg sent to intself */ 517 w3[rank] = 0; 518 for (i=0; i<size; i++) { 519 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 520 } 521 /* pa - is list of processors to communicate with */ 522 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 523 for (i=0,j=0; i<size; i++) { 524 if (w1[i]) {pa[j] = i; j++;} 525 } 526 527 /* Each message would have a header = 1 + 2*(no of IS) + data */ 528 for (i=0; i<nrqs; i++) { 529 j = pa[i]; 530 w1[j] += w2[j] + 2*w3[j]; 531 msz += w1[j]; 532 } 533 534 /* Determine the number of messages to expect, their lengths, from from-ids */ 535 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 536 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 537 538 /* Now post the Irecvs corresponding to these messages */ 539 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 540 541 /* Allocate Memory for outgoing messages */ 542 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 543 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr); 544 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 545 546 { 547 PetscInt *iptr = tmp,ict = 0; 548 for (i=0; i<nrqs; i++) { 549 j = pa[i]; 550 iptr += ict; 551 outdat[j] = iptr; 552 ict = w1[j]; 553 } 554 } 555 556 /* Form the outgoing messages */ 557 /* plug in the headers */ 558 for (i=0; i<nrqs; i++) { 559 j = pa[i]; 560 outdat[j][0] = 0; 561 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr); 562 ptr[j] = outdat[j] + 2*w3[j] + 1; 563 } 564 565 /* Memory for doing local proc's work */ 566 { 567 PetscInt M_BPB_imax = 0; 568 #if defined(PETSC_USE_CTABLE) 569 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 570 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 571 for (i=0; i<imax; i++) { 572 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 573 } 574 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 575 for (i=0; i<imax; i++) { 576 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 577 } 578 #else 579 PetscInt Mimax = 0; 580 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 581 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 582 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 583 for (i=0; i<imax; i++) { 584 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 585 data[i] = d_p + M*i; 586 } 587 #endif 588 } 589 590 /* Parse the IS and update local tables and the outgoing buf with the data */ 591 { 592 PetscInt n_i,isz_i,*outdat_j,ctr_j; 593 PetscBT table_i; 594 595 for (i=0; i<imax; i++) { 596 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 597 n_i = n[i]; 598 table_i = table[i]; 599 idx_i = idx[i]; 600 #if defined(PETSC_USE_CTABLE) 601 table_data_i = table_data[i]; 602 #else 603 data_i = data[i]; 604 #endif 605 isz_i = isz[i]; 606 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 607 row = idx_i[j]; 608 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 609 if (proc != rank) { /* copy to the outgoing buffer */ 610 ctr[proc]++; 611 *ptr[proc] = row; 612 ptr[proc]++; 613 } else if (!PetscBTLookupSet(table_i,row)) { 614 #if defined(PETSC_USE_CTABLE) 615 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 616 #else 617 data_i[isz_i] = row; /* Update the local table */ 618 #endif 619 isz_i++; 620 } 621 } 622 /* Update the headers for the current IS */ 623 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 624 if ((ctr_j = ctr[j])) { 625 outdat_j = outdat[j]; 626 k = ++outdat_j[0]; 627 outdat_j[2*k] = ctr_j; 628 outdat_j[2*k-1] = i; 629 } 630 } 631 isz[i] = isz_i; 632 } 633 } 634 635 /* Now post the sends */ 636 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 637 for (i=0; i<nrqs; ++i) { 638 j = pa[i]; 639 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 640 } 641 642 /* No longer need the original indices */ 643 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 644 for (i=0; i<imax; ++i) { 645 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 646 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 647 } 648 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 649 650 for (i=0; i<imax; ++i) { 651 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 652 } 653 654 /* Do Local work */ 655 #if defined(PETSC_USE_CTABLE) 656 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 657 #else 658 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 659 #endif 660 661 /* Receive messages */ 662 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 663 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);} 664 665 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 666 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);} 667 668 /* Phase 1 sends are complete - deallocate buffers */ 669 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 670 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 671 672 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 673 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 674 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 675 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 676 ierr = PetscFree(rbuf);CHKERRQ(ierr); 677 678 679 /* Send the data back */ 680 /* Do a global reduction to know the buffer space req for incoming messages */ 681 { 682 PetscMPIInt *rw1; 683 684 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 685 686 for (i=0; i<nrqr; ++i) { 687 proc = recv_status[i].MPI_SOURCE; 688 689 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 690 rw1[proc] = isz1[i]; 691 } 692 ierr = PetscFree(onodes1);CHKERRQ(ierr); 693 ierr = PetscFree(olengths1);CHKERRQ(ierr); 694 695 /* Determine the number of messages to expect, their lengths, from from-ids */ 696 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 697 ierr = PetscFree(rw1);CHKERRQ(ierr); 698 } 699 /* Now post the Irecvs corresponding to these messages */ 700 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 701 702 /* Now post the sends */ 703 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 704 for (i=0; i<nrqr; ++i) { 705 j = recv_status[i].MPI_SOURCE; 706 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr); 707 } 708 709 /* receive work done on other processors */ 710 { 711 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 712 PetscMPIInt idex; 713 PetscBT table_i; 714 MPI_Status *status2; 715 716 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 717 for (i=0; i<nrqs; ++i) { 718 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr); 719 /* Process the message */ 720 rbuf2_i = rbuf2[idex]; 721 ct1 = 2*rbuf2_i[0]+1; 722 jmax = rbuf2[idex][0]; 723 for (j=1; j<=jmax; j++) { 724 max = rbuf2_i[2*j]; 725 is_no = rbuf2_i[2*j-1]; 726 isz_i = isz[is_no]; 727 table_i = table[is_no]; 728 #if defined(PETSC_USE_CTABLE) 729 table_data_i = table_data[is_no]; 730 #else 731 data_i = data[is_no]; 732 #endif 733 for (k=0; k<max; k++,ct1++) { 734 row = rbuf2_i[ct1]; 735 if (!PetscBTLookupSet(table_i,row)) { 736 #if defined(PETSC_USE_CTABLE) 737 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 738 #else 739 data_i[isz_i] = row; 740 #endif 741 isz_i++; 742 } 743 } 744 isz[is_no] = isz_i; 745 } 746 } 747 748 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);} 749 ierr = PetscFree(status2);CHKERRQ(ierr); 750 } 751 752 #if defined(PETSC_USE_CTABLE) 753 tcount_max = 0; 754 for (i=0; i<imax; ++i) { 755 table_data_i = table_data[i]; 756 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 757 if (tcount_max < tcount) tcount_max = tcount; 758 } 759 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 760 #endif 761 762 for (i=0; i<imax; ++i) { 763 #if defined(PETSC_USE_CTABLE) 764 PetscTablePosition tpos; 765 table_data_i = table_data[i]; 766 767 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 768 while (tpos) { 769 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 770 tdata[--j] = --k; 771 } 772 ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 773 #else 774 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 775 #endif 776 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 777 } 778 779 ierr = PetscFree(iscomms);CHKERRQ(ierr); 780 ierr = PetscFree(onodes2);CHKERRQ(ierr); 781 ierr = PetscFree(olengths2);CHKERRQ(ierr); 782 783 ierr = PetscFree(pa);CHKERRQ(ierr); 784 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 785 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 786 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 787 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 788 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 789 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 790 ierr = PetscFree(s_status);CHKERRQ(ierr); 791 ierr = PetscFree(recv_status);CHKERRQ(ierr); 792 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 793 ierr = PetscFree(xdata);CHKERRQ(ierr); 794 ierr = PetscFree(isz1);CHKERRQ(ierr); 795 #if defined(PETSC_USE_CTABLE) 796 for (i=0; i<imax; i++) { 797 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 798 } 799 ierr = PetscFree(table_data);CHKERRQ(ierr); 800 ierr = PetscFree(tdata);CHKERRQ(ierr); 801 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 802 #else 803 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 804 #endif 805 PetscFunctionReturn(0); 806 } 807 808 /* 809 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 810 the work on the local processor. 811 812 Inputs: 813 C - MAT_MPIAIJ; 814 imax - total no of index sets processed at a time; 815 table - an array of char - size = m bits. 816 817 Output: 818 isz - array containing the count of the solution elements corresponding 819 to each index set; 820 data or table_data - pointer to the solutions 821 */ 822 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 823 { 824 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 825 Mat A = c->A,B = c->B; 826 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 827 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 828 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 829 PetscBT table_i; 830 #if defined(PETSC_USE_CTABLE) 831 PetscTable table_data_i; 832 PetscErrorCode ierr; 833 PetscTablePosition tpos; 834 PetscInt tcount,*tdata; 835 #else 836 PetscInt *data_i; 837 #endif 838 839 PetscFunctionBegin; 840 rstart = C->rmap->rstart; 841 cstart = C->cmap->rstart; 842 ai = a->i; 843 aj = a->j; 844 bi = b->i; 845 bj = b->j; 846 garray = c->garray; 847 848 for (i=0; i<imax; i++) { 849 #if defined(PETSC_USE_CTABLE) 850 /* copy existing entries of table_data_i into tdata[] */ 851 table_data_i = table_data[i]; 852 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 853 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 854 855 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 856 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 857 while (tpos) { 858 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 859 tdata[--j] = --row; 860 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 861 } 862 #else 863 data_i = data[i]; 864 #endif 865 table_i = table[i]; 866 isz_i = isz[i]; 867 max = isz[i]; 868 869 for (j=0; j<max; j++) { 870 #if defined(PETSC_USE_CTABLE) 871 row = tdata[j] - rstart; 872 #else 873 row = data_i[j] - rstart; 874 #endif 875 start = ai[row]; 876 end = ai[row+1]; 877 for (k=start; k<end; k++) { /* Amat */ 878 val = aj[k] + cstart; 879 if (!PetscBTLookupSet(table_i,val)) { 880 #if defined(PETSC_USE_CTABLE) 881 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 882 #else 883 data_i[isz_i] = val; 884 #endif 885 isz_i++; 886 } 887 } 888 start = bi[row]; 889 end = bi[row+1]; 890 for (k=start; k<end; k++) { /* Bmat */ 891 val = garray[bj[k]]; 892 if (!PetscBTLookupSet(table_i,val)) { 893 #if defined(PETSC_USE_CTABLE) 894 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 895 #else 896 data_i[isz_i] = val; 897 #endif 898 isz_i++; 899 } 900 } 901 } 902 isz[i] = isz_i; 903 904 #if defined(PETSC_USE_CTABLE) 905 ierr = PetscFree(tdata);CHKERRQ(ierr); 906 #endif 907 } 908 PetscFunctionReturn(0); 909 } 910 911 /* 912 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 913 and return the output 914 915 Input: 916 C - the matrix 917 nrqr - no of messages being processed. 918 rbuf - an array of pointers to the received requests 919 920 Output: 921 xdata - array of messages to be sent back 922 isz1 - size of each message 923 924 For better efficiency perhaps we should malloc separately each xdata[i], 925 then if a remalloc is required we need only copy the data for that one row 926 rather then all previous rows as it is now where a single large chunck of 927 memory is used. 928 929 */ 930 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 931 { 932 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 933 Mat A = c->A,B = c->B; 934 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 935 PetscErrorCode ierr; 936 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 937 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 938 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 939 PetscInt *rbuf_i,kmax,rbuf_0; 940 PetscBT xtable; 941 942 PetscFunctionBegin; 943 m = C->rmap->N; 944 rstart = C->rmap->rstart; 945 cstart = C->cmap->rstart; 946 ai = a->i; 947 aj = a->j; 948 bi = b->i; 949 bj = b->j; 950 garray = c->garray; 951 952 953 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 954 rbuf_i = rbuf[i]; 955 rbuf_0 = rbuf_i[0]; 956 ct += rbuf_0; 957 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 958 } 959 960 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 961 else max1 = 1; 962 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 963 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 964 ++no_malloc; 965 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 966 ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr); 967 968 ct3 = 0; 969 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 970 rbuf_i = rbuf[i]; 971 rbuf_0 = rbuf_i[0]; 972 ct1 = 2*rbuf_0+1; 973 ct2 = ct1; 974 ct3 += ct1; 975 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 976 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 977 oct2 = ct2; 978 kmax = rbuf_i[2*j]; 979 for (k=0; k<kmax; k++,ct1++) { 980 row = rbuf_i[ct1]; 981 if (!PetscBTLookupSet(xtable,row)) { 982 if (!(ct3 < mem_estimate)) { 983 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 984 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 985 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 986 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 987 xdata[0] = tmp; 988 mem_estimate = new_estimate; ++no_malloc; 989 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 990 } 991 xdata[i][ct2++] = row; 992 ct3++; 993 } 994 } 995 for (k=oct2,max2=ct2; k<max2; k++) { 996 row = xdata[i][k] - rstart; 997 start = ai[row]; 998 end = ai[row+1]; 999 for (l=start; l<end; l++) { 1000 val = aj[l] + cstart; 1001 if (!PetscBTLookupSet(xtable,val)) { 1002 if (!(ct3 < mem_estimate)) { 1003 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1004 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1005 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 1006 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1007 xdata[0] = tmp; 1008 mem_estimate = new_estimate; ++no_malloc; 1009 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1010 } 1011 xdata[i][ct2++] = val; 1012 ct3++; 1013 } 1014 } 1015 start = bi[row]; 1016 end = bi[row+1]; 1017 for (l=start; l<end; l++) { 1018 val = garray[bj[l]]; 1019 if (!PetscBTLookupSet(xtable,val)) { 1020 if (!(ct3 < mem_estimate)) { 1021 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 1022 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 1023 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr); 1024 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 1025 xdata[0] = tmp; 1026 mem_estimate = new_estimate; ++no_malloc; 1027 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 1028 } 1029 xdata[i][ct2++] = val; 1030 ct3++; 1031 } 1032 } 1033 } 1034 /* Update the header*/ 1035 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 1036 xdata[i][2*j-1] = rbuf_i[2*j-1]; 1037 } 1038 xdata[i][0] = rbuf_0; 1039 xdata[i+1] = xdata[i] + ct2; 1040 isz1[i] = ct2; /* size of each message */ 1041 } 1042 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 1043 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 1044 PetscFunctionReturn(0); 1045 } 1046 /* -------------------------------------------------------------------------*/ 1047 extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 1048 /* 1049 Every processor gets the entire matrix 1050 */ 1051 PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 1052 { 1053 Mat B; 1054 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 1055 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 1056 PetscErrorCode ierr; 1057 PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL; 1058 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 1059 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 1060 1061 PetscFunctionBegin; 1062 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 1063 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr); 1064 if (scall == MAT_INITIAL_MATRIX) { 1065 /* ---------------------------------------------------------------- 1066 Tell every processor the number of nonzeros per row 1067 */ 1068 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 1069 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 1070 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 1071 } 1072 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1073 for (i=0; i<size; i++) { 1074 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 1075 displs[i] = A->rmap->range[i]; 1076 } 1077 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1078 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1079 #else 1080 sendcount = A->rmap->rend - A->rmap->rstart; 1081 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1082 #endif 1083 /* --------------------------------------------------------------- 1084 Create the sequential matrix of the same type as the local block diagonal 1085 */ 1086 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 1087 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1088 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 1089 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 1090 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 1091 ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr); 1092 **Bin = B; 1093 b = (Mat_SeqAIJ*)B->data; 1094 1095 /*-------------------------------------------------------------------- 1096 Copy my part of matrix column indices over 1097 */ 1098 sendcount = ad->nz + bd->nz; 1099 jsendbuf = b->j + b->i[rstarts[rank]]; 1100 a_jsendbuf = ad->j; 1101 b_jsendbuf = bd->j; 1102 n = A->rmap->rend - A->rmap->rstart; 1103 cnt = 0; 1104 for (i=0; i<n; i++) { 1105 /* put in lower diagonal portion */ 1106 m = bd->i[i+1] - bd->i[i]; 1107 while (m > 0) { 1108 /* is it above diagonal (in bd (compressed) numbering) */ 1109 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 1110 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1111 m--; 1112 } 1113 1114 /* put in diagonal portion */ 1115 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1116 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 1117 } 1118 1119 /* put in upper diagonal portion */ 1120 while (m-- > 0) { 1121 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 1122 } 1123 } 1124 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1125 1126 /*-------------------------------------------------------------------- 1127 Gather all column indices to all processors 1128 */ 1129 for (i=0; i<size; i++) { 1130 recvcounts[i] = 0; 1131 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 1132 recvcounts[i] += lens[j]; 1133 } 1134 } 1135 displs[0] = 0; 1136 for (i=1; i<size; i++) { 1137 displs[i] = displs[i-1] + recvcounts[i-1]; 1138 } 1139 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1140 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1141 #else 1142 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1143 #endif 1144 /*-------------------------------------------------------------------- 1145 Assemble the matrix into useable form (note numerical values not yet set) 1146 */ 1147 /* set the b->ilen (length of each row) values */ 1148 ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr); 1149 /* set the b->i indices */ 1150 b->i[0] = 0; 1151 for (i=1; i<=A->rmap->N; i++) { 1152 b->i[i] = b->i[i-1] + lens[i-1]; 1153 } 1154 ierr = PetscFree(lens);CHKERRQ(ierr); 1155 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1156 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1157 1158 } else { 1159 B = **Bin; 1160 b = (Mat_SeqAIJ*)B->data; 1161 } 1162 1163 /*-------------------------------------------------------------------- 1164 Copy my part of matrix numerical values into the values location 1165 */ 1166 if (flag == MAT_GET_VALUES) { 1167 const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf; 1168 MatScalar *sendbuf,*recvbuf; 1169 1170 ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr); 1171 ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr); 1172 sendcount = ad->nz + bd->nz; 1173 sendbuf = b->a + b->i[rstarts[rank]]; 1174 a_sendbuf = ada; 1175 b_sendbuf = bda; 1176 b_sendj = bd->j; 1177 n = A->rmap->rend - A->rmap->rstart; 1178 cnt = 0; 1179 for (i=0; i<n; i++) { 1180 /* put in lower diagonal portion */ 1181 m = bd->i[i+1] - bd->i[i]; 1182 while (m > 0) { 1183 /* is it above diagonal (in bd (compressed) numbering) */ 1184 if (garray[*b_sendj] > A->rmap->rstart + i) break; 1185 sendbuf[cnt++] = *b_sendbuf++; 1186 m--; 1187 b_sendj++; 1188 } 1189 1190 /* put in diagonal portion */ 1191 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 1192 sendbuf[cnt++] = *a_sendbuf++; 1193 } 1194 1195 /* put in upper diagonal portion */ 1196 while (m-- > 0) { 1197 sendbuf[cnt++] = *b_sendbuf++; 1198 b_sendj++; 1199 } 1200 } 1201 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 1202 1203 /* ----------------------------------------------------------------- 1204 Gather all numerical values to all processors 1205 */ 1206 if (!recvcounts) { 1207 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 1208 } 1209 for (i=0; i<size; i++) { 1210 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 1211 } 1212 displs[0] = 0; 1213 for (i=1; i<size; i++) { 1214 displs[i] = displs[i-1] + recvcounts[i-1]; 1215 } 1216 recvbuf = b->a; 1217 #if defined(PETSC_HAVE_MPI_IN_PLACE) 1218 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1219 #else 1220 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr); 1221 #endif 1222 ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr); 1223 ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr); 1224 } /* endof (flag == MAT_GET_VALUES) */ 1225 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 1226 1227 if (A->symmetric) { 1228 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1229 } else if (A->hermitian) { 1230 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 1231 } else if (A->structurally_symmetric) { 1232 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 1233 } 1234 PetscFunctionReturn(0); 1235 } 1236 1237 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 1238 { 1239 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 1240 Mat submat,A = c->A,B = c->B; 1241 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc; 1242 PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB; 1243 PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray; 1244 const PetscInt *icol,*irow; 1245 PetscInt nrow,ncol,start; 1246 PetscErrorCode ierr; 1247 PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr; 1248 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc; 1249 PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr; 1250 PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz; 1251 PetscInt *lens,rmax,ncols,*cols,Crow; 1252 #if defined(PETSC_USE_CTABLE) 1253 PetscTable cmap,rmap; 1254 PetscInt *cmap_loc,*rmap_loc; 1255 #else 1256 PetscInt *cmap,*rmap; 1257 #endif 1258 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i; 1259 PetscInt *cworkB,lwrite,*subcols,*row2proc; 1260 PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL; 1261 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1262 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1263 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1264 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1265 MPI_Comm comm; 1266 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1267 PetscMPIInt *onodes1,*olengths1,idex,end; 1268 Mat_SubSppt *smatis1; 1269 PetscBool isrowsorted,iscolsorted; 1270 1271 PetscFunctionBegin; 1272 PetscValidLogicalCollectiveInt(C,ismax,2); 1273 PetscValidLogicalCollectiveEnum(C,scall,5); 1274 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1275 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1276 ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1277 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1278 size = c->size; 1279 rank = c->rank; 1280 1281 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1282 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1283 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1284 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1285 if (allcolumns) { 1286 icol = NULL; 1287 ncol = C->cmap->N; 1288 } else { 1289 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1290 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1291 } 1292 1293 if (scall == MAT_INITIAL_MATRIX) { 1294 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1295 1296 /* Get some new tags to keep the communication clean */ 1297 tag1 = ((PetscObject)C)->tag; 1298 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1299 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1300 1301 /* evaluate communication - mesg to who, length of mesg, and buffer space 1302 required. Based on this, buffers are allocated, and data copied into them */ 1303 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1304 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1305 1306 /* w1[proc] = num of rows owned by proc -- to be requested */ 1307 proc = 0; 1308 nrqs = 0; /* num of outgoing messages */ 1309 for (j=0; j<nrow; j++) { 1310 row = irow[j]; 1311 if (!isrowsorted) proc = 0; 1312 while (row >= C->rmap->range[proc+1]) proc++; 1313 w1[proc]++; 1314 row2proc[j] = proc; /* map row index to proc */ 1315 1316 if (proc != rank && !w2[proc]) { 1317 w2[proc] = 1; nrqs++; 1318 } 1319 } 1320 w1[rank] = 0; /* rows owned by self will not be requested */ 1321 1322 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1323 for (proc=0,j=0; proc<size; proc++) { 1324 if (w1[proc]) { pa[j++] = proc;} 1325 } 1326 1327 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1328 msz = 0; /* total mesg length (for all procs) */ 1329 for (i=0; i<nrqs; i++) { 1330 proc = pa[i]; 1331 w1[proc] += 3; 1332 msz += w1[proc]; 1333 } 1334 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1335 1336 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1337 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1338 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1339 1340 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1341 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1342 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1343 1344 /* Now post the Irecvs corresponding to these messages */ 1345 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1346 1347 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1348 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1349 1350 /* Allocate Memory for outgoing messages */ 1351 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1352 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1353 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1354 1355 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1356 iptr = tmp; 1357 for (i=0; i<nrqs; i++) { 1358 proc = pa[i]; 1359 sbuf1[proc] = iptr; 1360 iptr += w1[proc]; 1361 } 1362 1363 /* Form the outgoing messages */ 1364 /* Initialize the header space */ 1365 for (i=0; i<nrqs; i++) { 1366 proc = pa[i]; 1367 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1368 ptr[proc] = sbuf1[proc] + 3; 1369 } 1370 1371 /* Parse the isrow and copy data into outbuf */ 1372 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1373 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1374 proc = row2proc[j]; 1375 if (proc != rank) { /* copy to the outgoing buf*/ 1376 *ptr[proc] = irow[j]; 1377 ctr[proc]++; ptr[proc]++; 1378 } 1379 } 1380 1381 /* Update the headers for the current IS */ 1382 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1383 if ((ctr_j = ctr[j])) { 1384 sbuf1_j = sbuf1[j]; 1385 k = ++sbuf1_j[0]; 1386 sbuf1_j[2*k] = ctr_j; 1387 sbuf1_j[2*k-1] = 0; 1388 } 1389 } 1390 1391 /* Now post the sends */ 1392 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1393 for (i=0; i<nrqs; ++i) { 1394 proc = pa[i]; 1395 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 1396 } 1397 1398 /* Post Receives to capture the buffer size */ 1399 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1400 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1401 1402 rbuf2[0] = tmp + msz; 1403 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1404 1405 for (i=0; i<nrqs; ++i) { 1406 proc = pa[i]; 1407 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 1408 } 1409 1410 ierr = PetscFree2(w1,w2);CHKERRQ(ierr); 1411 1412 /* Send to other procs the buf size they should allocate */ 1413 /* Receive messages*/ 1414 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1415 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1416 1417 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 1418 for (i=0; i<nrqr; ++i) { 1419 req_size[i] = 0; 1420 rbuf1_i = rbuf1[i]; 1421 start = 2*rbuf1_i[0] + 1; 1422 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 1423 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1424 sbuf2_i = sbuf2[i]; 1425 for (j=start; j<end; j++) { 1426 k = rbuf1_i[j] - rstart; 1427 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1428 sbuf2_i[j] = ncols; 1429 req_size[i] += ncols; 1430 } 1431 req_source1[i] = r_status1[i].MPI_SOURCE; 1432 1433 /* form the header */ 1434 sbuf2_i[0] = req_size[i]; 1435 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1436 1437 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 1438 } 1439 1440 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1441 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1442 1443 /* rbuf2 is received, Post recv column indices a->j */ 1444 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 1445 1446 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1447 for (i=0; i<nrqs; ++i) { 1448 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1449 req_source2[i] = r_status2[i].MPI_SOURCE; 1450 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 1451 } 1452 1453 /* Wait on sends1 and sends2 */ 1454 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1455 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr); 1456 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1457 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1458 1459 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr); 1460 ierr = PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1461 1462 /* Now allocate sending buffers for a->j, and send them off */ 1463 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1464 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1465 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1466 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1467 1468 for (i=0; i<nrqr; i++) { /* for each requested message */ 1469 rbuf1_i = rbuf1[i]; 1470 sbuf_aj_i = sbuf_aj[i]; 1471 ct1 = 2*rbuf1_i[0] + 1; 1472 ct2 = 0; 1473 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1474 1475 kmax = rbuf1[i][2]; 1476 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1477 row = rbuf1_i[ct1] - rstart; 1478 nzA = ai[row+1] - ai[row]; 1479 nzB = bi[row+1] - bi[row]; 1480 ncols = nzA + nzB; 1481 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1482 1483 /* load the column indices for this row into cols*/ 1484 cols = sbuf_aj_i + ct2; 1485 1486 lwrite = 0; 1487 for (l=0; l<nzB; l++) { 1488 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1489 } 1490 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1491 for (l=0; l<nzB; l++) { 1492 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1493 } 1494 1495 ct2 += ncols; 1496 } 1497 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 1498 } 1499 1500 /* create column map (cmap): global col of C -> local col of submat */ 1501 #if defined(PETSC_USE_CTABLE) 1502 if (!allcolumns) { 1503 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1504 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1505 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1506 if (icol[j] >= cstart && icol[j] <cend) { 1507 cmap_loc[icol[j] - cstart] = j+1; 1508 } else { /* use PetscTable for non-local col indices */ 1509 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1510 } 1511 } 1512 } else { 1513 cmap = NULL; 1514 cmap_loc = NULL; 1515 } 1516 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1517 #else 1518 if (!allcolumns) { 1519 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1520 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1521 } else { 1522 cmap = NULL; 1523 } 1524 #endif 1525 1526 /* Create lens for MatSeqAIJSetPreallocation() */ 1527 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1528 1529 /* Compute lens from local part of C */ 1530 for (j=0; j<nrow; j++) { 1531 row = irow[j]; 1532 proc = row2proc[j]; 1533 if (proc == rank) { 1534 /* diagonal part A = c->A */ 1535 ncols = ai[row-rstart+1] - ai[row-rstart]; 1536 cols = aj + ai[row-rstart]; 1537 if (!allcolumns) { 1538 for (k=0; k<ncols; k++) { 1539 #if defined(PETSC_USE_CTABLE) 1540 tcol = cmap_loc[cols[k]]; 1541 #else 1542 tcol = cmap[cols[k]+cstart]; 1543 #endif 1544 if (tcol) lens[j]++; 1545 } 1546 } else { /* allcolumns */ 1547 lens[j] = ncols; 1548 } 1549 1550 /* off-diagonal part B = c->B */ 1551 ncols = bi[row-rstart+1] - bi[row-rstart]; 1552 cols = bj + bi[row-rstart]; 1553 if (!allcolumns) { 1554 for (k=0; k<ncols; k++) { 1555 #if defined(PETSC_USE_CTABLE) 1556 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1557 #else 1558 tcol = cmap[bmap[cols[k]]]; 1559 #endif 1560 if (tcol) lens[j]++; 1561 } 1562 } else { /* allcolumns */ 1563 lens[j] += ncols; 1564 } 1565 } 1566 } 1567 1568 /* Create row map (rmap): global row of C -> local row of submat */ 1569 #if defined(PETSC_USE_CTABLE) 1570 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1571 for (j=0; j<nrow; j++) { 1572 row = irow[j]; 1573 proc = row2proc[j]; 1574 if (proc == rank) { /* a local row */ 1575 rmap_loc[row - rstart] = j; 1576 } else { 1577 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1578 } 1579 } 1580 #else 1581 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1582 for (j=0; j<nrow; j++) { 1583 rmap[irow[j]] = j; 1584 } 1585 #endif 1586 1587 /* Update lens from offproc data */ 1588 /* recv a->j is done */ 1589 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 1590 for (i=0; i<nrqs; i++) { 1591 proc = pa[i]; 1592 sbuf1_i = sbuf1[proc]; 1593 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1594 ct1 = 2 + 1; 1595 ct2 = 0; 1596 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1597 rbuf3_i = rbuf3[i]; /* received C->j */ 1598 1599 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1600 max1 = sbuf1_i[2]; 1601 for (k=0; k<max1; k++,ct1++) { 1602 #if defined(PETSC_USE_CTABLE) 1603 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1604 row--; 1605 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1606 #else 1607 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1608 #endif 1609 /* Now, store row index of submat in sbuf1_i[ct1] */ 1610 sbuf1_i[ct1] = row; 1611 1612 nnz = rbuf2_i[ct1]; 1613 if (!allcolumns) { 1614 for (l=0; l<nnz; l++,ct2++) { 1615 #if defined(PETSC_USE_CTABLE) 1616 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1617 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1618 } else { 1619 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1620 } 1621 #else 1622 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1623 #endif 1624 if (tcol) lens[row]++; 1625 } 1626 } else { /* allcolumns */ 1627 lens[row] += nnz; 1628 } 1629 } 1630 } 1631 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr); 1632 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1633 1634 /* Create the submatrices */ 1635 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1636 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1637 1638 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1639 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1640 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1641 ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1642 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1643 1644 /* create struct Mat_SubSppt and attached it to submat */ 1645 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1646 subc = (Mat_SeqAIJ*)submat->data; 1647 subc->submatis1 = smatis1; 1648 1649 smatis1->id = 0; 1650 smatis1->nrqs = nrqs; 1651 smatis1->nrqr = nrqr; 1652 smatis1->rbuf1 = rbuf1; 1653 smatis1->rbuf2 = rbuf2; 1654 smatis1->rbuf3 = rbuf3; 1655 smatis1->sbuf2 = sbuf2; 1656 smatis1->req_source2 = req_source2; 1657 1658 smatis1->sbuf1 = sbuf1; 1659 smatis1->ptr = ptr; 1660 smatis1->tmp = tmp; 1661 smatis1->ctr = ctr; 1662 1663 smatis1->pa = pa; 1664 smatis1->req_size = req_size; 1665 smatis1->req_source1 = req_source1; 1666 1667 smatis1->allcolumns = allcolumns; 1668 smatis1->singleis = PETSC_TRUE; 1669 smatis1->row2proc = row2proc; 1670 smatis1->rmap = rmap; 1671 smatis1->cmap = cmap; 1672 #if defined(PETSC_USE_CTABLE) 1673 smatis1->rmap_loc = rmap_loc; 1674 smatis1->cmap_loc = cmap_loc; 1675 #endif 1676 1677 smatis1->destroy = submat->ops->destroy; 1678 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1679 submat->factortype = C->factortype; 1680 1681 /* compute rmax */ 1682 rmax = 0; 1683 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1684 1685 } else { /* scall == MAT_REUSE_MATRIX */ 1686 submat = submats[0]; 1687 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1688 1689 subc = (Mat_SeqAIJ*)submat->data; 1690 rmax = subc->rmax; 1691 smatis1 = subc->submatis1; 1692 nrqs = smatis1->nrqs; 1693 nrqr = smatis1->nrqr; 1694 rbuf1 = smatis1->rbuf1; 1695 rbuf2 = smatis1->rbuf2; 1696 rbuf3 = smatis1->rbuf3; 1697 req_source2 = smatis1->req_source2; 1698 1699 sbuf1 = smatis1->sbuf1; 1700 sbuf2 = smatis1->sbuf2; 1701 ptr = smatis1->ptr; 1702 tmp = smatis1->tmp; 1703 ctr = smatis1->ctr; 1704 1705 pa = smatis1->pa; 1706 req_size = smatis1->req_size; 1707 req_source1 = smatis1->req_source1; 1708 1709 allcolumns = smatis1->allcolumns; 1710 row2proc = smatis1->row2proc; 1711 rmap = smatis1->rmap; 1712 cmap = smatis1->cmap; 1713 #if defined(PETSC_USE_CTABLE) 1714 rmap_loc = smatis1->rmap_loc; 1715 cmap_loc = smatis1->cmap_loc; 1716 #endif 1717 } 1718 1719 /* Post recv matrix values */ 1720 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1721 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1722 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1723 for (i=0; i<nrqs; ++i) { 1724 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1725 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 1726 } 1727 1728 /* Allocate sending buffers for a->a, and send them off */ 1729 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1730 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1731 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1732 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1733 1734 for (i=0; i<nrqr; i++) { 1735 rbuf1_i = rbuf1[i]; 1736 sbuf_aa_i = sbuf_aa[i]; 1737 ct1 = 2*rbuf1_i[0]+1; 1738 ct2 = 0; 1739 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1740 1741 kmax = rbuf1_i[2]; 1742 for (k=0; k<kmax; k++,ct1++) { 1743 row = rbuf1_i[ct1] - rstart; 1744 nzA = ai[row+1] - ai[row]; 1745 nzB = bi[row+1] - bi[row]; 1746 ncols = nzA + nzB; 1747 cworkB = bj + bi[row]; 1748 vworkA = a_a + ai[row]; 1749 vworkB = b_a + bi[row]; 1750 1751 /* load the column values for this row into vals*/ 1752 vals = sbuf_aa_i + ct2; 1753 1754 lwrite = 0; 1755 for (l=0; l<nzB; l++) { 1756 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1757 } 1758 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1759 for (l=0; l<nzB; l++) { 1760 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1761 } 1762 1763 ct2 += ncols; 1764 } 1765 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 1766 } 1767 1768 /* Assemble submat */ 1769 /* First assemble the local rows */ 1770 for (j=0; j<nrow; j++) { 1771 row = irow[j]; 1772 proc = row2proc[j]; 1773 if (proc == rank) { 1774 Crow = row - rstart; /* local row index of C */ 1775 #if defined(PETSC_USE_CTABLE) 1776 row = rmap_loc[Crow]; /* row index of submat */ 1777 #else 1778 row = rmap[row]; 1779 #endif 1780 1781 if (allcolumns) { 1782 /* diagonal part A = c->A */ 1783 ncols = ai[Crow+1] - ai[Crow]; 1784 cols = aj + ai[Crow]; 1785 vals = a_a + ai[Crow]; 1786 i = 0; 1787 for (k=0; k<ncols; k++) { 1788 subcols[i] = cols[k] + cstart; 1789 subvals[i++] = vals[k]; 1790 } 1791 1792 /* off-diagonal part B = c->B */ 1793 ncols = bi[Crow+1] - bi[Crow]; 1794 cols = bj + bi[Crow]; 1795 vals = b_a + bi[Crow]; 1796 for (k=0; k<ncols; k++) { 1797 subcols[i] = bmap[cols[k]]; 1798 subvals[i++] = vals[k]; 1799 } 1800 1801 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1802 1803 } else { /* !allcolumns */ 1804 #if defined(PETSC_USE_CTABLE) 1805 /* diagonal part A = c->A */ 1806 ncols = ai[Crow+1] - ai[Crow]; 1807 cols = aj + ai[Crow]; 1808 vals = a_a + ai[Crow]; 1809 i = 0; 1810 for (k=0; k<ncols; k++) { 1811 tcol = cmap_loc[cols[k]]; 1812 if (tcol) { 1813 subcols[i] = --tcol; 1814 subvals[i++] = vals[k]; 1815 } 1816 } 1817 1818 /* off-diagonal part B = c->B */ 1819 ncols = bi[Crow+1] - bi[Crow]; 1820 cols = bj + bi[Crow]; 1821 vals = b_a + bi[Crow]; 1822 for (k=0; k<ncols; k++) { 1823 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1824 if (tcol) { 1825 subcols[i] = --tcol; 1826 subvals[i++] = vals[k]; 1827 } 1828 } 1829 #else 1830 /* diagonal part A = c->A */ 1831 ncols = ai[Crow+1] - ai[Crow]; 1832 cols = aj + ai[Crow]; 1833 vals = a_a + ai[Crow]; 1834 i = 0; 1835 for (k=0; k<ncols; k++) { 1836 tcol = cmap[cols[k]+cstart]; 1837 if (tcol) { 1838 subcols[i] = --tcol; 1839 subvals[i++] = vals[k]; 1840 } 1841 } 1842 1843 /* off-diagonal part B = c->B */ 1844 ncols = bi[Crow+1] - bi[Crow]; 1845 cols = bj + bi[Crow]; 1846 vals = b_a + bi[Crow]; 1847 for (k=0; k<ncols; k++) { 1848 tcol = cmap[bmap[cols[k]]]; 1849 if (tcol) { 1850 subcols[i] = --tcol; 1851 subvals[i++] = vals[k]; 1852 } 1853 } 1854 #endif 1855 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1856 } 1857 } 1858 } 1859 1860 /* Now assemble the off-proc rows */ 1861 for (i=0; i<nrqs; i++) { /* for each requested message */ 1862 /* recv values from other processes */ 1863 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRMPI(ierr); 1864 proc = pa[idex]; 1865 sbuf1_i = sbuf1[proc]; 1866 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1867 ct1 = 2 + 1; 1868 ct2 = 0; /* count of received C->j */ 1869 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1870 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1871 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1872 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1873 1874 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1875 max1 = sbuf1_i[2]; /* num of rows */ 1876 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1877 row = sbuf1_i[ct1]; /* row index of submat */ 1878 if (!allcolumns) { 1879 idex = 0; 1880 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1881 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1882 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1883 #if defined(PETSC_USE_CTABLE) 1884 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1885 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1886 } else { 1887 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1888 } 1889 #else 1890 tcol = cmap[rbuf3_i[ct2]]; 1891 #endif 1892 if (tcol) { 1893 subcols[idex] = --tcol; /* may not be sorted */ 1894 subvals[idex++] = rbuf4_i[ct2]; 1895 1896 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1897 For reuse, we replace received C->j with index that should be inserted to submat */ 1898 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1899 } 1900 } 1901 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1902 } else { /* scall == MAT_REUSE_MATRIX */ 1903 submat = submats[0]; 1904 subc = (Mat_SeqAIJ*)submat->data; 1905 1906 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1907 for (l=0; l<nnz; l++) { 1908 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1909 subvals[idex++] = rbuf4_i[ct2]; 1910 } 1911 1912 bj = subc->j + subc->i[row]; /* sorted column indices */ 1913 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1914 } 1915 } else { /* allcolumns */ 1916 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1917 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1918 ct2 += nnz; 1919 } 1920 } 1921 } 1922 1923 /* sending a->a are done */ 1924 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr); 1925 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1926 1927 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1928 ierr = MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1929 submats[0] = submat; 1930 1931 /* Restore the indices */ 1932 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1933 if (!allcolumns) { 1934 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1935 } 1936 1937 /* Destroy allocated memory */ 1938 for (i=0; i<nrqs; ++i) { 1939 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1940 } 1941 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1942 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1943 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1944 1945 if (scall == MAT_INITIAL_MATRIX) { 1946 ierr = PetscFree(lens);CHKERRQ(ierr); 1947 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1948 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1949 } 1950 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1951 ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1952 PetscFunctionReturn(0); 1953 } 1954 1955 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1956 { 1957 PetscErrorCode ierr; 1958 PetscInt ncol; 1959 PetscBool colflag,allcolumns=PETSC_FALSE; 1960 1961 PetscFunctionBegin; 1962 /* Allocate memory to hold all the submatrices */ 1963 if (scall == MAT_INITIAL_MATRIX) { 1964 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1965 } 1966 1967 /* Check for special case: each processor gets entire matrix columns */ 1968 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1969 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1970 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1971 1972 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1973 PetscFunctionReturn(0); 1974 } 1975 1976 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1977 { 1978 PetscErrorCode ierr; 1979 PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1980 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1981 Mat_SeqAIJ *subc; 1982 Mat_SubSppt *smat; 1983 1984 PetscFunctionBegin; 1985 /* Check for special case: each processor has a single IS */ 1986 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1987 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1988 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1989 PetscFunctionReturn(0); 1990 } 1991 1992 /* Collect global wantallmatrix and nstages */ 1993 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1994 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1995 if (!nmax) nmax = 1; 1996 1997 if (scall == MAT_INITIAL_MATRIX) { 1998 /* Collect global wantallmatrix and nstages */ 1999 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2000 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2001 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2002 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2003 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2004 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2005 wantallmatrix = PETSC_TRUE; 2006 2007 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2008 } 2009 } 2010 2011 /* Determine the number of stages through which submatrices are done 2012 Each stage will extract nmax submatrices. 2013 nmax is determined by the matrix column dimension. 2014 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2015 */ 2016 nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */ 2017 2018 in[0] = -1*(PetscInt)wantallmatrix; 2019 in[1] = nstages; 2020 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2021 wantallmatrix = (PetscBool)(-out[0]); 2022 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2023 2024 } else { /* MAT_REUSE_MATRIX */ 2025 if (ismax) { 2026 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2027 smat = subc->submatis1; 2028 } else { /* (*submat)[0] is a dummy matrix */ 2029 smat = (Mat_SubSppt*)(*submat)[0]->data; 2030 } 2031 if (!smat) { 2032 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2033 wantallmatrix = PETSC_TRUE; 2034 } else if (smat->singleis) { 2035 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2036 PetscFunctionReturn(0); 2037 } else { 2038 nstages = smat->nstages; 2039 } 2040 } 2041 2042 if (wantallmatrix) { 2043 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2044 PetscFunctionReturn(0); 2045 } 2046 2047 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2048 if (scall == MAT_INITIAL_MATRIX) { 2049 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2050 } 2051 2052 for (i=0,pos=0; i<nstages; i++) { 2053 if (pos+nmax <= ismax) max_no = nmax; 2054 else if (pos >= ismax) max_no = 0; 2055 else max_no = ismax-pos; 2056 2057 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2058 if (!max_no) { 2059 if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2060 smat = (Mat_SubSppt*)(*submat)[pos]->data; 2061 smat->nstages = nstages; 2062 } 2063 pos++; /* advance to next dummy matrix if any */ 2064 } else pos += max_no; 2065 } 2066 2067 if (ismax && scall == MAT_INITIAL_MATRIX) { 2068 /* save nstages for reuse */ 2069 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2070 smat = subc->submatis1; 2071 smat->nstages = nstages; 2072 } 2073 PetscFunctionReturn(0); 2074 } 2075 2076 /* -------------------------------------------------------------------------*/ 2077 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2078 { 2079 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2080 Mat A = c->A; 2081 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2082 const PetscInt **icol,**irow; 2083 PetscInt *nrow,*ncol,start; 2084 PetscErrorCode ierr; 2085 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2086 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2087 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2088 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2089 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2090 #if defined(PETSC_USE_CTABLE) 2091 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2092 #else 2093 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2094 #endif 2095 const PetscInt *irow_i; 2096 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2097 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2098 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2099 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2100 MPI_Status *r_status3,*r_status4,*s_status4; 2101 MPI_Comm comm; 2102 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2103 PetscMPIInt *onodes1,*olengths1,end; 2104 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2105 Mat_SubSppt *smat_i; 2106 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2107 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2108 2109 PetscFunctionBegin; 2110 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2111 size = c->size; 2112 rank = c->rank; 2113 2114 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2115 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2116 2117 for (i=0; i<ismax; i++) { 2118 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2119 if (!issorted[i]) iscsorted = issorted[i]; 2120 2121 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2122 2123 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2124 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2125 2126 /* Check for special case: allcolumn */ 2127 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2128 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2129 if (colflag && ncol[i] == C->cmap->N) { 2130 allcolumns[i] = PETSC_TRUE; 2131 icol[i] = NULL; 2132 } else { 2133 allcolumns[i] = PETSC_FALSE; 2134 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2135 } 2136 } 2137 2138 if (scall == MAT_REUSE_MATRIX) { 2139 /* Assumes new rows are same length as the old rows */ 2140 for (i=0; i<ismax; i++) { 2141 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2142 subc = (Mat_SeqAIJ*)submats[i]->data; 2143 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 2144 2145 /* Initial matrix as if empty */ 2146 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2147 2148 smat_i = subc->submatis1; 2149 2150 nrqs = smat_i->nrqs; 2151 nrqr = smat_i->nrqr; 2152 rbuf1 = smat_i->rbuf1; 2153 rbuf2 = smat_i->rbuf2; 2154 rbuf3 = smat_i->rbuf3; 2155 req_source2 = smat_i->req_source2; 2156 2157 sbuf1 = smat_i->sbuf1; 2158 sbuf2 = smat_i->sbuf2; 2159 ptr = smat_i->ptr; 2160 tmp = smat_i->tmp; 2161 ctr = smat_i->ctr; 2162 2163 pa = smat_i->pa; 2164 req_size = smat_i->req_size; 2165 req_source1 = smat_i->req_source1; 2166 2167 allcolumns[i] = smat_i->allcolumns; 2168 row2proc[i] = smat_i->row2proc; 2169 rmap[i] = smat_i->rmap; 2170 cmap[i] = smat_i->cmap; 2171 } 2172 2173 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2174 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2175 smat_i = (Mat_SubSppt*)submats[0]->data; 2176 2177 nrqs = smat_i->nrqs; 2178 nrqr = smat_i->nrqr; 2179 rbuf1 = smat_i->rbuf1; 2180 rbuf2 = smat_i->rbuf2; 2181 rbuf3 = smat_i->rbuf3; 2182 req_source2 = smat_i->req_source2; 2183 2184 sbuf1 = smat_i->sbuf1; 2185 sbuf2 = smat_i->sbuf2; 2186 ptr = smat_i->ptr; 2187 tmp = smat_i->tmp; 2188 ctr = smat_i->ctr; 2189 2190 pa = smat_i->pa; 2191 req_size = smat_i->req_size; 2192 req_source1 = smat_i->req_source1; 2193 2194 allcolumns[0] = PETSC_FALSE; 2195 } 2196 } else { /* scall == MAT_INITIAL_MATRIX */ 2197 /* Get some new tags to keep the communication clean */ 2198 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2199 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2200 2201 /* evaluate communication - mesg to who, length of mesg, and buffer space 2202 required. Based on this, buffers are allocated, and data copied into them*/ 2203 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2204 2205 for (i=0; i<ismax; i++) { 2206 jmax = nrow[i]; 2207 irow_i = irow[i]; 2208 2209 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2210 row2proc[i] = row2proc_i; 2211 2212 if (issorted[i]) proc = 0; 2213 for (j=0; j<jmax; j++) { 2214 if (!issorted[i]) proc = 0; 2215 row = irow_i[j]; 2216 while (row >= C->rmap->range[proc+1]) proc++; 2217 w4[proc]++; 2218 row2proc_i[j] = proc; /* map row index to proc */ 2219 } 2220 for (j=0; j<size; j++) { 2221 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2222 } 2223 } 2224 2225 nrqs = 0; /* no of outgoing messages */ 2226 msz = 0; /* total mesg length (for all procs) */ 2227 w1[rank] = 0; /* no mesg sent to self */ 2228 w3[rank] = 0; 2229 for (i=0; i<size; i++) { 2230 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2231 } 2232 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2233 for (i=0,j=0; i<size; i++) { 2234 if (w1[i]) { pa[j] = i; j++; } 2235 } 2236 2237 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2238 for (i=0; i<nrqs; i++) { 2239 j = pa[i]; 2240 w1[j] += w2[j] + 2* w3[j]; 2241 msz += w1[j]; 2242 } 2243 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2244 2245 /* Determine the number of messages to expect, their lengths, from from-ids */ 2246 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2247 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2248 2249 /* Now post the Irecvs corresponding to these messages */ 2250 tag0 = ((PetscObject)C)->tag; 2251 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2252 2253 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2254 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2255 2256 /* Allocate Memory for outgoing messages */ 2257 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2258 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2259 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2260 2261 { 2262 PetscInt *iptr = tmp; 2263 k = 0; 2264 for (i=0; i<nrqs; i++) { 2265 j = pa[i]; 2266 iptr += k; 2267 sbuf1[j] = iptr; 2268 k = w1[j]; 2269 } 2270 } 2271 2272 /* Form the outgoing messages. Initialize the header space */ 2273 for (i=0; i<nrqs; i++) { 2274 j = pa[i]; 2275 sbuf1[j][0] = 0; 2276 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2277 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2278 } 2279 2280 /* Parse the isrow and copy data into outbuf */ 2281 for (i=0; i<ismax; i++) { 2282 row2proc_i = row2proc[i]; 2283 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2284 irow_i = irow[i]; 2285 jmax = nrow[i]; 2286 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2287 proc = row2proc_i[j]; 2288 if (proc != rank) { /* copy to the outgoing buf*/ 2289 ctr[proc]++; 2290 *ptr[proc] = irow_i[j]; 2291 ptr[proc]++; 2292 } 2293 } 2294 /* Update the headers for the current IS */ 2295 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2296 if ((ctr_j = ctr[j])) { 2297 sbuf1_j = sbuf1[j]; 2298 k = ++sbuf1_j[0]; 2299 sbuf1_j[2*k] = ctr_j; 2300 sbuf1_j[2*k-1] = i; 2301 } 2302 } 2303 } 2304 2305 /* Now post the sends */ 2306 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2307 for (i=0; i<nrqs; ++i) { 2308 j = pa[i]; 2309 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr); 2310 } 2311 2312 /* Post Receives to capture the buffer size */ 2313 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2314 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2315 rbuf2[0] = tmp + msz; 2316 for (i=1; i<nrqs; ++i) { 2317 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2318 } 2319 for (i=0; i<nrqs; ++i) { 2320 j = pa[i]; 2321 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 2322 } 2323 2324 /* Send to other procs the buf size they should allocate */ 2325 /* Receive messages*/ 2326 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2327 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2328 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2329 { 2330 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2331 PetscInt *sbuf2_i; 2332 2333 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 2334 for (i=0; i<nrqr; ++i) { 2335 req_size[i] = 0; 2336 rbuf1_i = rbuf1[i]; 2337 start = 2*rbuf1_i[0] + 1; 2338 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRMPI(ierr); 2339 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2340 sbuf2_i = sbuf2[i]; 2341 for (j=start; j<end; j++) { 2342 id = rbuf1_i[j] - rstart; 2343 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2344 sbuf2_i[j] = ncols; 2345 req_size[i] += ncols; 2346 } 2347 req_source1[i] = r_status1[i].MPI_SOURCE; 2348 /* form the header */ 2349 sbuf2_i[0] = req_size[i]; 2350 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2351 2352 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 2353 } 2354 } 2355 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2356 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2357 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2358 2359 /* Receive messages*/ 2360 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2361 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2362 2363 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 2364 for (i=0; i<nrqs; ++i) { 2365 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2366 req_source2[i] = r_status2[i].MPI_SOURCE; 2367 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 2368 } 2369 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2370 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2371 2372 /* Wait on sends1 and sends2 */ 2373 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2374 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2375 2376 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);} 2377 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);} 2378 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2379 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2380 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2381 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2382 2383 /* Now allocate sending buffers for a->j, and send them off */ 2384 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2385 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2386 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2387 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2388 2389 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2390 { 2391 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2392 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2393 PetscInt cend = C->cmap->rend; 2394 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2395 2396 for (i=0; i<nrqr; i++) { 2397 rbuf1_i = rbuf1[i]; 2398 sbuf_aj_i = sbuf_aj[i]; 2399 ct1 = 2*rbuf1_i[0] + 1; 2400 ct2 = 0; 2401 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2402 kmax = rbuf1[i][2*j]; 2403 for (k=0; k<kmax; k++,ct1++) { 2404 row = rbuf1_i[ct1] - rstart; 2405 nzA = a_i[row+1] - a_i[row]; 2406 nzB = b_i[row+1] - b_i[row]; 2407 ncols = nzA + nzB; 2408 cworkA = a_j + a_i[row]; 2409 cworkB = b_j + b_i[row]; 2410 2411 /* load the column indices for this row into cols */ 2412 cols = sbuf_aj_i + ct2; 2413 2414 lwrite = 0; 2415 for (l=0; l<nzB; l++) { 2416 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2417 } 2418 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2419 for (l=0; l<nzB; l++) { 2420 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2421 } 2422 2423 ct2 += ncols; 2424 } 2425 } 2426 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 2427 } 2428 } 2429 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2430 2431 /* create col map: global col of C -> local col of submatrices */ 2432 { 2433 const PetscInt *icol_i; 2434 #if defined(PETSC_USE_CTABLE) 2435 for (i=0; i<ismax; i++) { 2436 if (!allcolumns[i]) { 2437 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2438 2439 jmax = ncol[i]; 2440 icol_i = icol[i]; 2441 cmap_i = cmap[i]; 2442 for (j=0; j<jmax; j++) { 2443 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2444 } 2445 } else cmap[i] = NULL; 2446 } 2447 #else 2448 for (i=0; i<ismax; i++) { 2449 if (!allcolumns[i]) { 2450 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2451 jmax = ncol[i]; 2452 icol_i = icol[i]; 2453 cmap_i = cmap[i]; 2454 for (j=0; j<jmax; j++) { 2455 cmap_i[icol_i[j]] = j+1; 2456 } 2457 } else cmap[i] = NULL; 2458 } 2459 #endif 2460 } 2461 2462 /* Create lens which is required for MatCreate... */ 2463 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2464 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2465 2466 if (ismax) { 2467 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2468 } 2469 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2470 2471 /* Update lens from local data */ 2472 for (i=0; i<ismax; i++) { 2473 row2proc_i = row2proc[i]; 2474 jmax = nrow[i]; 2475 if (!allcolumns[i]) cmap_i = cmap[i]; 2476 irow_i = irow[i]; 2477 lens_i = lens[i]; 2478 for (j=0; j<jmax; j++) { 2479 row = irow_i[j]; 2480 proc = row2proc_i[j]; 2481 if (proc == rank) { 2482 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2483 if (!allcolumns[i]) { 2484 for (k=0; k<ncols; k++) { 2485 #if defined(PETSC_USE_CTABLE) 2486 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2487 #else 2488 tcol = cmap_i[cols[k]]; 2489 #endif 2490 if (tcol) lens_i[j]++; 2491 } 2492 } else { /* allcolumns */ 2493 lens_i[j] = ncols; 2494 } 2495 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2496 } 2497 } 2498 } 2499 2500 /* Create row map: global row of C -> local row of submatrices */ 2501 #if defined(PETSC_USE_CTABLE) 2502 for (i=0; i<ismax; i++) { 2503 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2504 irow_i = irow[i]; 2505 jmax = nrow[i]; 2506 for (j=0; j<jmax; j++) { 2507 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2508 } 2509 } 2510 #else 2511 for (i=0; i<ismax; i++) { 2512 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2513 rmap_i = rmap[i]; 2514 irow_i = irow[i]; 2515 jmax = nrow[i]; 2516 for (j=0; j<jmax; j++) { 2517 rmap_i[irow_i[j]] = j; 2518 } 2519 } 2520 #endif 2521 2522 /* Update lens from offproc data */ 2523 { 2524 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2525 2526 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRMPI(ierr); 2527 for (tmp2=0; tmp2<nrqs; tmp2++) { 2528 sbuf1_i = sbuf1[pa[tmp2]]; 2529 jmax = sbuf1_i[0]; 2530 ct1 = 2*jmax+1; 2531 ct2 = 0; 2532 rbuf2_i = rbuf2[tmp2]; 2533 rbuf3_i = rbuf3[tmp2]; 2534 for (j=1; j<=jmax; j++) { 2535 is_no = sbuf1_i[2*j-1]; 2536 max1 = sbuf1_i[2*j]; 2537 lens_i = lens[is_no]; 2538 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2539 rmap_i = rmap[is_no]; 2540 for (k=0; k<max1; k++,ct1++) { 2541 #if defined(PETSC_USE_CTABLE) 2542 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2543 row--; 2544 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2545 #else 2546 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2547 #endif 2548 max2 = rbuf2_i[ct1]; 2549 for (l=0; l<max2; l++,ct2++) { 2550 if (!allcolumns[is_no]) { 2551 #if defined(PETSC_USE_CTABLE) 2552 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2553 #else 2554 tcol = cmap_i[rbuf3_i[ct2]]; 2555 #endif 2556 if (tcol) lens_i[row]++; 2557 } else { /* allcolumns */ 2558 lens_i[row]++; /* lens_i[row] += max2 ? */ 2559 } 2560 } 2561 } 2562 } 2563 } 2564 } 2565 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2566 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);} 2567 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2568 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2569 2570 /* Create the submatrices */ 2571 for (i=0; i<ismax; i++) { 2572 PetscInt rbs,cbs; 2573 2574 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2575 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2576 2577 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2578 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2579 2580 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2581 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2582 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2583 2584 /* create struct Mat_SubSppt and attached it to submat */ 2585 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2586 subc = (Mat_SeqAIJ*)submats[i]->data; 2587 subc->submatis1 = smat_i; 2588 2589 smat_i->destroy = submats[i]->ops->destroy; 2590 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2591 submats[i]->factortype = C->factortype; 2592 2593 smat_i->id = i; 2594 smat_i->nrqs = nrqs; 2595 smat_i->nrqr = nrqr; 2596 smat_i->rbuf1 = rbuf1; 2597 smat_i->rbuf2 = rbuf2; 2598 smat_i->rbuf3 = rbuf3; 2599 smat_i->sbuf2 = sbuf2; 2600 smat_i->req_source2 = req_source2; 2601 2602 smat_i->sbuf1 = sbuf1; 2603 smat_i->ptr = ptr; 2604 smat_i->tmp = tmp; 2605 smat_i->ctr = ctr; 2606 2607 smat_i->pa = pa; 2608 smat_i->req_size = req_size; 2609 smat_i->req_source1 = req_source1; 2610 2611 smat_i->allcolumns = allcolumns[i]; 2612 smat_i->singleis = PETSC_FALSE; 2613 smat_i->row2proc = row2proc[i]; 2614 smat_i->rmap = rmap[i]; 2615 smat_i->cmap = cmap[i]; 2616 } 2617 2618 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2619 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2620 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2621 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2622 2623 /* create struct Mat_SubSppt and attached it to submat */ 2624 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2625 submats[0]->data = (void*)smat_i; 2626 2627 smat_i->destroy = submats[0]->ops->destroy; 2628 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2629 submats[0]->factortype = C->factortype; 2630 2631 smat_i->id = 0; 2632 smat_i->nrqs = nrqs; 2633 smat_i->nrqr = nrqr; 2634 smat_i->rbuf1 = rbuf1; 2635 smat_i->rbuf2 = rbuf2; 2636 smat_i->rbuf3 = rbuf3; 2637 smat_i->sbuf2 = sbuf2; 2638 smat_i->req_source2 = req_source2; 2639 2640 smat_i->sbuf1 = sbuf1; 2641 smat_i->ptr = ptr; 2642 smat_i->tmp = tmp; 2643 smat_i->ctr = ctr; 2644 2645 smat_i->pa = pa; 2646 smat_i->req_size = req_size; 2647 smat_i->req_source1 = req_source1; 2648 2649 smat_i->allcolumns = PETSC_FALSE; 2650 smat_i->singleis = PETSC_FALSE; 2651 smat_i->row2proc = NULL; 2652 smat_i->rmap = NULL; 2653 smat_i->cmap = NULL; 2654 } 2655 2656 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2657 ierr = PetscFree(lens);CHKERRQ(ierr); 2658 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2659 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2660 2661 } /* endof scall == MAT_INITIAL_MATRIX */ 2662 2663 /* Post recv matrix values */ 2664 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2665 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2666 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2667 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2668 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2669 for (i=0; i<nrqs; ++i) { 2670 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2671 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 2672 } 2673 2674 /* Allocate sending buffers for a->a, and send them off */ 2675 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2676 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2677 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2678 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2679 2680 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2681 { 2682 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2683 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2684 PetscInt cend = C->cmap->rend; 2685 PetscInt *b_j = b->j; 2686 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2687 2688 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2689 ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2690 for (i=0; i<nrqr; i++) { 2691 rbuf1_i = rbuf1[i]; 2692 sbuf_aa_i = sbuf_aa[i]; 2693 ct1 = 2*rbuf1_i[0]+1; 2694 ct2 = 0; 2695 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2696 kmax = rbuf1_i[2*j]; 2697 for (k=0; k<kmax; k++,ct1++) { 2698 row = rbuf1_i[ct1] - rstart; 2699 nzA = a_i[row+1] - a_i[row]; 2700 nzB = b_i[row+1] - b_i[row]; 2701 ncols = nzA + nzB; 2702 cworkB = b_j + b_i[row]; 2703 vworkA = a_a + a_i[row]; 2704 vworkB = b_a + b_i[row]; 2705 2706 /* load the column values for this row into vals*/ 2707 vals = sbuf_aa_i+ct2; 2708 2709 lwrite = 0; 2710 for (l=0; l<nzB; l++) { 2711 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2712 } 2713 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2714 for (l=0; l<nzB; l++) { 2715 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2716 } 2717 2718 ct2 += ncols; 2719 } 2720 } 2721 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 2722 } 2723 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2724 ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2725 } 2726 2727 /* Assemble the matrices */ 2728 /* First assemble the local rows */ 2729 for (i=0; i<ismax; i++) { 2730 row2proc_i = row2proc[i]; 2731 subc = (Mat_SeqAIJ*)submats[i]->data; 2732 imat_ilen = subc->ilen; 2733 imat_j = subc->j; 2734 imat_i = subc->i; 2735 imat_a = subc->a; 2736 2737 if (!allcolumns[i]) cmap_i = cmap[i]; 2738 rmap_i = rmap[i]; 2739 irow_i = irow[i]; 2740 jmax = nrow[i]; 2741 for (j=0; j<jmax; j++) { 2742 row = irow_i[j]; 2743 proc = row2proc_i[j]; 2744 if (proc == rank) { 2745 old_row = row; 2746 #if defined(PETSC_USE_CTABLE) 2747 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2748 row--; 2749 #else 2750 row = rmap_i[row]; 2751 #endif 2752 ilen_row = imat_ilen[row]; 2753 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2754 mat_i = imat_i[row]; 2755 mat_a = imat_a + mat_i; 2756 mat_j = imat_j + mat_i; 2757 if (!allcolumns[i]) { 2758 for (k=0; k<ncols; k++) { 2759 #if defined(PETSC_USE_CTABLE) 2760 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2761 #else 2762 tcol = cmap_i[cols[k]]; 2763 #endif 2764 if (tcol) { 2765 *mat_j++ = tcol - 1; 2766 *mat_a++ = vals[k]; 2767 ilen_row++; 2768 } 2769 } 2770 } else { /* allcolumns */ 2771 for (k=0; k<ncols; k++) { 2772 *mat_j++ = cols[k]; /* global col index! */ 2773 *mat_a++ = vals[k]; 2774 ilen_row++; 2775 } 2776 } 2777 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2778 2779 imat_ilen[row] = ilen_row; 2780 } 2781 } 2782 } 2783 2784 /* Now assemble the off proc rows */ 2785 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRMPI(ierr); 2786 for (tmp2=0; tmp2<nrqs; tmp2++) { 2787 sbuf1_i = sbuf1[pa[tmp2]]; 2788 jmax = sbuf1_i[0]; 2789 ct1 = 2*jmax + 1; 2790 ct2 = 0; 2791 rbuf2_i = rbuf2[tmp2]; 2792 rbuf3_i = rbuf3[tmp2]; 2793 rbuf4_i = rbuf4[tmp2]; 2794 for (j=1; j<=jmax; j++) { 2795 is_no = sbuf1_i[2*j-1]; 2796 rmap_i = rmap[is_no]; 2797 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2798 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2799 imat_ilen = subc->ilen; 2800 imat_j = subc->j; 2801 imat_i = subc->i; 2802 imat_a = subc->a; 2803 max1 = sbuf1_i[2*j]; 2804 for (k=0; k<max1; k++,ct1++) { 2805 row = sbuf1_i[ct1]; 2806 #if defined(PETSC_USE_CTABLE) 2807 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2808 row--; 2809 #else 2810 row = rmap_i[row]; 2811 #endif 2812 ilen = imat_ilen[row]; 2813 mat_i = imat_i[row]; 2814 mat_a = imat_a + mat_i; 2815 mat_j = imat_j + mat_i; 2816 max2 = rbuf2_i[ct1]; 2817 if (!allcolumns[is_no]) { 2818 for (l=0; l<max2; l++,ct2++) { 2819 #if defined(PETSC_USE_CTABLE) 2820 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2821 #else 2822 tcol = cmap_i[rbuf3_i[ct2]]; 2823 #endif 2824 if (tcol) { 2825 *mat_j++ = tcol - 1; 2826 *mat_a++ = rbuf4_i[ct2]; 2827 ilen++; 2828 } 2829 } 2830 } else { /* allcolumns */ 2831 for (l=0; l<max2; l++,ct2++) { 2832 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2833 *mat_a++ = rbuf4_i[ct2]; 2834 ilen++; 2835 } 2836 } 2837 imat_ilen[row] = ilen; 2838 } 2839 } 2840 } 2841 2842 if (!iscsorted) { /* sort column indices of the rows */ 2843 for (i=0; i<ismax; i++) { 2844 subc = (Mat_SeqAIJ*)submats[i]->data; 2845 imat_j = subc->j; 2846 imat_i = subc->i; 2847 imat_a = subc->a; 2848 imat_ilen = subc->ilen; 2849 2850 if (allcolumns[i]) continue; 2851 jmax = nrow[i]; 2852 for (j=0; j<jmax; j++) { 2853 mat_i = imat_i[j]; 2854 mat_a = imat_a + mat_i; 2855 mat_j = imat_j + mat_i; 2856 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2857 } 2858 } 2859 } 2860 2861 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2862 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2863 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);} 2864 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2865 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2866 2867 /* Restore the indices */ 2868 for (i=0; i<ismax; i++) { 2869 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2870 if (!allcolumns[i]) { 2871 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2872 } 2873 } 2874 2875 for (i=0; i<ismax; i++) { 2876 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2877 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2878 } 2879 2880 /* Destroy allocated memory */ 2881 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2882 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2883 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2884 2885 for (i=0; i<nrqs; ++i) { 2886 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2887 } 2888 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2889 2890 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2891 PetscFunctionReturn(0); 2892 } 2893 2894 /* 2895 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2896 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2897 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2898 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 2899 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2900 state, and needs to be "assembled" later by compressing B's column space. 2901 2902 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2903 Following this call, C->A & C->B have been created, even if empty. 2904 */ 2905 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2906 { 2907 /* If making this function public, change the error returned in this function away from _PLIB. */ 2908 PetscErrorCode ierr; 2909 Mat_MPIAIJ *aij; 2910 Mat_SeqAIJ *Baij; 2911 PetscBool seqaij,Bdisassembled; 2912 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2913 PetscScalar v; 2914 const PetscInt *rowindices,*colindices; 2915 2916 PetscFunctionBegin; 2917 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2918 if (A) { 2919 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2920 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2921 if (rowemb) { 2922 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2923 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2924 } else { 2925 if (C->rmap->n != A->rmap->n) { 2926 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2927 } 2928 } 2929 if (dcolemb) { 2930 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2931 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2932 } else { 2933 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2934 } 2935 } 2936 if (B) { 2937 ierr = PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2938 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2939 if (rowemb) { 2940 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2941 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2942 } else { 2943 if (C->rmap->n != B->rmap->n) { 2944 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2945 } 2946 } 2947 if (ocolemb) { 2948 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2949 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2950 } else { 2951 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2952 } 2953 } 2954 2955 aij = (Mat_MPIAIJ*)C->data; 2956 if (!aij->A) { 2957 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2958 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2959 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2960 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2961 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2962 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2963 } 2964 if (A) { 2965 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2966 } else { 2967 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2968 } 2969 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2970 /* 2971 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2972 need to "disassemble" B -- convert it to using C's global indices. 2973 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 2974 2975 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2976 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2977 2978 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2979 At least avoid calling MatSetValues() and the implied searches? 2980 */ 2981 2982 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2983 #if defined(PETSC_USE_CTABLE) 2984 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2985 #else 2986 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2987 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2988 if (aij->B) { 2989 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2990 } 2991 #endif 2992 ngcol = 0; 2993 if (aij->lvec) { 2994 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2995 } 2996 if (aij->garray) { 2997 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2998 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2999 } 3000 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 3001 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 3002 } 3003 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 3004 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 3005 } 3006 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3007 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3008 } 3009 } 3010 Bdisassembled = PETSC_FALSE; 3011 if (!aij->B) { 3012 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3013 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3014 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3015 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3016 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3017 Bdisassembled = PETSC_TRUE; 3018 } 3019 if (B) { 3020 Baij = (Mat_SeqAIJ*)B->data; 3021 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3022 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3023 for (i=0; i<B->rmap->n; i++) { 3024 nz[i] = Baij->i[i+1] - Baij->i[i]; 3025 } 3026 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3027 ierr = PetscFree(nz);CHKERRQ(ierr); 3028 } 3029 3030 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3031 shift = rend-rstart; 3032 count = 0; 3033 rowindices = NULL; 3034 colindices = NULL; 3035 if (rowemb) { 3036 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3037 } 3038 if (ocolemb) { 3039 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3040 } 3041 for (i=0; i<B->rmap->n; i++) { 3042 PetscInt row; 3043 row = i; 3044 if (rowindices) row = rowindices[i]; 3045 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3046 col = Baij->j[count]; 3047 if (colindices) col = colindices[col]; 3048 if (Bdisassembled && col>=rstart) col += shift; 3049 v = Baij->a[count]; 3050 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3051 ++count; 3052 } 3053 } 3054 /* No assembly for aij->B is necessary. */ 3055 /* FIXME: set aij->B's nonzerostate correctly. */ 3056 } else { 3057 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3058 } 3059 C->preallocated = PETSC_TRUE; 3060 C->was_assembled = PETSC_FALSE; 3061 C->assembled = PETSC_FALSE; 3062 /* 3063 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3064 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3065 */ 3066 PetscFunctionReturn(0); 3067 } 3068 3069 /* 3070 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 3071 */ 3072 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3073 { 3074 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3075 3076 PetscFunctionBegin; 3077 PetscValidPointer(A,2); 3078 PetscValidPointer(B,3); 3079 /* FIXME: make sure C is assembled */ 3080 *A = aij->A; 3081 *B = aij->B; 3082 /* Note that we don't incref *A and *B, so be careful! */ 3083 PetscFunctionReturn(0); 3084 } 3085 3086 /* 3087 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3088 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3089 */ 3090 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3091 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3092 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3093 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3094 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3095 { 3096 PetscErrorCode ierr; 3097 PetscMPIInt size,flag; 3098 PetscInt i,ii,cismax,ispar; 3099 Mat *A,*B; 3100 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3101 3102 PetscFunctionBegin; 3103 if (!ismax) PetscFunctionReturn(0); 3104 3105 for (i = 0, cismax = 0; i < ismax; ++i) { 3106 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr); 3107 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3108 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr); 3109 if (size > 1) ++cismax; 3110 } 3111 3112 /* 3113 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3114 ispar counts the number of parallel ISs across C's comm. 3115 */ 3116 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3117 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3118 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3119 PetscFunctionReturn(0); 3120 } 3121 3122 /* if (ispar) */ 3123 /* 3124 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3125 These are used to extract the off-diag portion of the resulting parallel matrix. 3126 The row IS for the off-diag portion is the same as for the diag portion, 3127 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3128 */ 3129 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3130 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3131 for (i = 0, ii = 0; i < ismax; ++i) { 3132 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3133 if (size > 1) { 3134 /* 3135 TODO: This is the part that's ***NOT SCALABLE***. 3136 To fix this we need to extract just the indices of C's nonzero columns 3137 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3138 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3139 to be done without serializing on the IS list, so, most likely, it is best 3140 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3141 */ 3142 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3143 /* Now we have to 3144 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3145 were sorted on each rank, concatenated they might no longer be sorted; 3146 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3147 indices in the nondecreasing order to the original index positions. 3148 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 3149 */ 3150 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3151 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3152 ++ii; 3153 } 3154 } 3155 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3156 for (i = 0, ii = 0; i < ismax; ++i) { 3157 PetscInt j,issize; 3158 const PetscInt *indices; 3159 3160 /* 3161 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3162 */ 3163 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3164 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3165 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3166 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3167 for (j = 1; j < issize; ++j) { 3168 if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3169 } 3170 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3171 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3172 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3173 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3174 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3175 for (j = 1; j < issize; ++j) { 3176 if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3177 } 3178 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3179 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3180 if (size > 1) { 3181 cisrow[ii] = isrow[i]; 3182 ++ii; 3183 } 3184 } 3185 /* 3186 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3187 array of sequential matrices underlying the resulting parallel matrices. 3188 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3189 contain duplicates. 3190 3191 There are as many diag matrices as there are original index sets. There are only as many parallel 3192 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3193 3194 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3195 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3196 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3197 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3198 with A[i] and B[ii] extracted from the corresponding MPI submat. 3199 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3200 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3201 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3202 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3203 values into A[i] and B[ii] sitting inside the corresponding submat. 3204 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3205 A[i], B[ii], AA[i] or BB[ii] matrices. 3206 */ 3207 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3208 if (scall == MAT_INITIAL_MATRIX) { 3209 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3210 } 3211 3212 /* Now obtain the sequential A and B submatrices separately. */ 3213 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3214 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3215 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3216 3217 /* 3218 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3219 matrices A & B have been extracted directly into the parallel matrices containing them, or 3220 simply into the sequential matrix identical with the corresponding A (if size == 1). 3221 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3222 to have the same sparsity pattern. 3223 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3224 must be constructed for C. This is done by setseqmat(s). 3225 */ 3226 for (i = 0, ii = 0; i < ismax; ++i) { 3227 /* 3228 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3229 That way we can avoid sorting and computing permutations when reusing. 3230 To this end: 3231 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3232 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3233 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3234 */ 3235 MatStructure pattern = DIFFERENT_NONZERO_PATTERN; 3236 3237 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3238 /* Construct submat[i] from the Seq pieces A (and B, if necessary). */ 3239 if (size > 1) { 3240 if (scall == MAT_INITIAL_MATRIX) { 3241 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3242 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3243 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3244 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3245 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3246 } 3247 /* 3248 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3249 */ 3250 { 3251 Mat AA = A[i],BB = B[ii]; 3252 3253 if (AA || BB) { 3254 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3255 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3256 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3257 } 3258 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3259 } 3260 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3261 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3262 ++ii; 3263 } else { /* if (size == 1) */ 3264 if (scall == MAT_REUSE_MATRIX) { 3265 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3266 } 3267 if (isrow_p[i] || iscol_p[i]) { 3268 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3269 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3270 /* Otherwise A is extracted straight into (*submats)[i]. */ 3271 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */ 3272 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3273 } else (*submat)[i] = A[i]; 3274 } 3275 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3276 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3277 } 3278 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3279 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3280 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3281 ierr = PetscFree(A);CHKERRQ(ierr); 3282 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3283 PetscFunctionReturn(0); 3284 } 3285 3286 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3287 { 3288 PetscErrorCode ierr; 3289 3290 PetscFunctionBegin; 3291 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3292 PetscFunctionReturn(0); 3293 } 3294