1 /* 2 Routines to compute overlapping regions of a parallel MPI matrix 3 and to find submatrices that were shared across processors. 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 #include <petscsf.h> 9 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*); 12 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 13 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 15 16 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*); 17 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*); 18 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **); 19 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *); 20 21 22 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 23 { 24 PetscErrorCode ierr; 25 PetscInt i; 26 27 PetscFunctionBegin; 28 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 29 for (i=0; i<ov; ++i) { 30 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 31 } 32 PetscFunctionReturn(0); 33 } 34 35 PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov) 36 { 37 PetscErrorCode ierr; 38 PetscInt i; 39 40 PetscFunctionBegin; 41 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 42 for (i=0; i<ov; ++i) { 43 ierr = MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);CHKERRQ(ierr); 44 } 45 
PetscFunctionReturn(0); 46 } 47 48 49 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[]) 50 { 51 PetscErrorCode ierr; 52 MPI_Comm comm; 53 PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j; 54 PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata; 55 PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL; 56 const PetscInt *indices_i,**indices; 57 PetscLayout rmap; 58 PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner; 59 PetscSF sf; 60 PetscSFNode *remote; 61 62 PetscFunctionBegin; 63 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 64 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 65 ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 66 /* get row map to determine where rows should be going */ 67 ierr = MatGetLayouts(mat,&rmap,NULL);CHKERRQ(ierr); 68 /* retrieve IS data and put all together so that we 69 * can optimize communication 70 * */ 71 ierr = PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);CHKERRQ(ierr); 72 for (i=0,tlength=0; i<nidx; i++){ 73 ierr = ISGetLocalSize(is[i],&length[i]);CHKERRQ(ierr); 74 tlength += length[i]; 75 ierr = ISGetIndices(is[i],&indices[i]);CHKERRQ(ierr); 76 } 77 /* find these rows on remote processors */ 78 ierr = PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);CHKERRQ(ierr); 79 ierr = PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);CHKERRQ(ierr); 80 nrrows = 0; 81 for (i=0; i<nidx; i++) { 82 length_i = length[i]; 83 indices_i = indices[i]; 84 for (j=0; j<length_i; j++) { 85 owner = -1; 86 ierr = PetscLayoutFindOwner(rmap,indices_i[j],&owner);CHKERRQ(ierr); 87 /* remote processors */ 88 if (owner != rank) { 89 tosizes_temp[owner]++; /* number of rows to owner */ 90 rrow_ranks[nrrows] = owner; /* processor */ 91 rrow_isids[nrrows] = i; /* is id */ 92 remoterows[nrrows++] = indices_i[j]; /* row */ 93 } 94 } 95 ierr = 
ISRestoreIndices(is[i],&indices[i]);CHKERRQ(ierr); 96 } 97 ierr = PetscFree2(*(PetscInt***)&indices,length);CHKERRQ(ierr); 98 /* test if we need to exchange messages 99 * generally speaking, we do not need to exchange 100 * data when overlap is 1 101 * */ 102 ierr = MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);CHKERRQ(ierr); 103 /* we do not have any messages 104 * It usually corresponds to overlap 1 105 * */ 106 if (!reducednrrows) { 107 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 108 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 109 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 110 PetscFunctionReturn(0); 111 } 112 nto = 0; 113 /* send sizes and ranks for building a two-sided communcation */ 114 for (i=0; i<size; i++) { 115 if (tosizes_temp[i]) { 116 tosizes[nto*2] = tosizes_temp[i]*2; /* size */ 117 tosizes_temp[i] = nto; /* a map from processor to index */ 118 toranks[nto++] = i; /* processor */ 119 } 120 } 121 ierr = PetscMalloc1(nto+1,&toffsets);CHKERRQ(ierr); 122 toffsets[0] = 0; 123 for (i=0; i<nto; i++) { 124 toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */ 125 tosizes[2*i+1] = toffsets[i]; /* offsets to send */ 126 } 127 /* send information to other processors */ 128 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);CHKERRQ(ierr); 129 nrecvrows = 0; 130 for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i]; 131 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 132 nrecvrows = 0; 133 for (i=0; i<nfrom; i++) { 134 for (j=0; j<fromsizes[2*i]; j++) { 135 remote[nrecvrows].rank = fromranks[i]; 136 remote[nrecvrows++].index = fromsizes[2*i+1]+j; 137 } 138 } 139 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 140 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 141 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 142 ierr 
= PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 143 ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr); 144 /* message pair <no of is, row> */ 145 ierr = PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);CHKERRQ(ierr); 146 for (i=0; i<nrrows; i++) { 147 owner = rrow_ranks[i]; /* processor */ 148 j = tosizes_temp[owner]; /* index */ 149 todata[toffsets[j]++] = rrow_isids[i]; 150 todata[toffsets[j]++] = remoterows[i]; 151 } 152 ierr = PetscFree3(toranks,tosizes,tosizes_temp);CHKERRQ(ierr); 153 ierr = PetscFree3(remoterows,rrow_ranks,rrow_isids);CHKERRQ(ierr); 154 ierr = PetscFree(toffsets);CHKERRQ(ierr); 155 ierr = PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 156 ierr = PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata);CHKERRQ(ierr); 157 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 158 /* send rows belonging to the remote so that then we could get the overlapping data back */ 159 ierr = MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);CHKERRQ(ierr); 160 ierr = PetscFree2(todata,fromdata);CHKERRQ(ierr); 161 ierr = PetscFree(fromsizes);CHKERRQ(ierr); 162 ierr = PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);CHKERRQ(ierr); 163 ierr = PetscFree(fromranks);CHKERRQ(ierr); 164 nrecvrows = 0; 165 for (i=0; i<nto; i++) nrecvrows += tosizes[2*i]; 166 ierr = PetscCalloc1(nrecvrows,&todata);CHKERRQ(ierr); 167 ierr = PetscMalloc1(nrecvrows,&remote);CHKERRQ(ierr); 168 nrecvrows = 0; 169 for (i=0; i<nto; i++) { 170 for (j=0; j<tosizes[2*i]; j++) { 171 remote[nrecvrows].rank = toranks[i]; 172 remote[nrecvrows++].index = tosizes[2*i+1]+j; 173 } 174 } 175 ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr); 176 ierr = PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);CHKERRQ(ierr); 177 /* use two-sided communication by default since OPENMPI has some bugs for one-sided one */ 178 ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr); 179 ierr = 
PetscSFSetFromOptions(sf);CHKERRQ(ierr); 180 /* overlap communication and computation */ 181 ierr = PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 182 ierr = MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);CHKERRQ(ierr); 183 ierr = PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata);CHKERRQ(ierr); 184 ierr = PetscSFDestroy(&sf);CHKERRQ(ierr); 185 ierr = PetscFree2(sbdata,sbsizes);CHKERRQ(ierr); 186 ierr = MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);CHKERRQ(ierr); 187 ierr = PetscFree(toranks);CHKERRQ(ierr); 188 ierr = PetscFree(tosizes);CHKERRQ(ierr); 189 ierr = PetscFree(todata);CHKERRQ(ierr); 190 PetscFunctionReturn(0); 191 } 192 193 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata) 194 { 195 PetscInt *isz,isz_i,i,j,is_id, data_size; 196 PetscInt col,lsize,max_lsize,*indices_temp, *indices_i; 197 const PetscInt *indices_i_temp; 198 MPI_Comm *iscomms; 199 PetscErrorCode ierr; 200 201 PetscFunctionBegin; 202 max_lsize = 0; 203 ierr = PetscMalloc1(nidx,&isz);CHKERRQ(ierr); 204 for (i=0; i<nidx; i++){ 205 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 206 max_lsize = lsize>max_lsize ? 
lsize:max_lsize; 207 isz[i] = lsize; 208 } 209 ierr = PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);CHKERRQ(ierr); 210 for (i=0; i<nidx; i++){ 211 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 212 ierr = ISGetIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 213 ierr = PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);CHKERRQ(ierr); 214 ierr = ISRestoreIndices(is[i],&indices_i_temp);CHKERRQ(ierr); 215 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 216 } 217 /* retrieve information to get row id and its overlap */ 218 for (i=0; i<nrecvs;){ 219 is_id = recvdata[i++]; 220 data_size = recvdata[i++]; 221 indices_i = indices_temp+(max_lsize+nrecvs)*is_id; 222 isz_i = isz[is_id]; 223 for (j=0; j< data_size; j++){ 224 col = recvdata[i++]; 225 indices_i[isz_i++] = col; 226 } 227 isz[is_id] = isz_i; 228 } 229 /* remove duplicate entities */ 230 for (i=0; i<nidx; i++){ 231 indices_i = indices_temp+(max_lsize+nrecvs)*i; 232 isz_i = isz[i]; 233 ierr = PetscSortRemoveDupsInt(&isz_i,indices_i);CHKERRQ(ierr); 234 ierr = ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 235 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 236 } 237 ierr = PetscFree(isz);CHKERRQ(ierr); 238 ierr = PetscFree2(indices_temp,iscomms);CHKERRQ(ierr); 239 PetscFunctionReturn(0); 240 } 241 242 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows) 243 { 244 PetscLayout rmap,cmap; 245 PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i; 246 PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata; 247 PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets; 248 const PetscInt *gcols,*ai,*aj,*bi,*bj; 249 Mat amat,bmat; 250 PetscMPIInt rank; 251 PetscBool done; 252 MPI_Comm comm; 253 
PetscErrorCode ierr; 254 255 PetscFunctionBegin; 256 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 257 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 258 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 259 /* Even if the mat is symmetric, we still assume it is not symmetric */ 260 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 261 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 262 ierr = MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 263 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 264 /* total number of nonzero values is used to estimate the memory usage in the next step */ 265 tnz = ai[an]+bi[bn]; 266 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 267 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 268 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 269 /* to find the longest message */ 270 max_fszs = 0; 271 for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs; 272 /* better way to estimate number of nonzero in the mat??? 
*/ 273 ierr = PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);CHKERRQ(ierr); 274 for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i; 275 rows_pos = 0; 276 totalrows = 0; 277 for (i=0; i<nfrom; i++){ 278 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 279 /* group data together */ 280 for (j=0; j<fromsizes[2*i]; j+=2){ 281 is_id = fromrows[rows_pos++];/* no of is */ 282 rows_i = rows_data[is_id]; 283 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */ 284 } 285 /* estimate a space to avoid multiple allocations */ 286 for (j=0; j<nidx; j++){ 287 indvc_ij = 0; 288 rows_i = rows_data[j]; 289 for (l=0; l<rows_pos_i[j]; l++){ 290 row = rows_i[l]-rstart; 291 start = ai[row]; 292 end = ai[row+1]; 293 for (k=start; k<end; k++){ /* Amat */ 294 col = aj[k] + cstart; 295 indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */ 296 } 297 start = bi[row]; 298 end = bi[row+1]; 299 for (k=start; k<end; k++) { /* Bmat */ 300 col = gcols[bj[k]]; 301 indices_tmp[indvc_ij++] = col; 302 } 303 } 304 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 305 indv_counts[i*nidx+j] = indvc_ij; 306 totalrows += indvc_ij; 307 } 308 } 309 /* message triple <no of is, number of rows, rows> */ 310 ierr = PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);CHKERRQ(ierr); 311 totalrows = 0; 312 rows_pos = 0; 313 /* use this code again */ 314 for (i=0;i<nfrom;i++){ 315 ierr = PetscArrayzero(rows_pos_i,nidx);CHKERRQ(ierr); 316 for (j=0; j<fromsizes[2*i]; j+=2){ 317 is_id = fromrows[rows_pos++]; 318 rows_i = rows_data[is_id]; 319 rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++]; 320 } 321 /* add data */ 322 for (j=0; j<nidx; j++){ 323 if (!indv_counts[i*nidx+j]) continue; 324 indvc_ij = 0; 325 sbdata[totalrows++] = j; 326 sbdata[totalrows++] = indv_counts[i*nidx+j]; 327 sbsizes[2*i] += 2; 328 rows_i = rows_data[j]; 329 for (l=0; l<rows_pos_i[j]; l++){ 330 
row = rows_i[l]-rstart; 331 start = ai[row]; 332 end = ai[row+1]; 333 for (k=start; k<end; k++){ /* Amat */ 334 col = aj[k] + cstart; 335 indices_tmp[indvc_ij++] = col; 336 } 337 start = bi[row]; 338 end = bi[row+1]; 339 for (k=start; k<end; k++) { /* Bmat */ 340 col = gcols[bj[k]]; 341 indices_tmp[indvc_ij++] = col; 342 } 343 } 344 ierr = PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);CHKERRQ(ierr); 345 sbsizes[2*i] += indvc_ij; 346 ierr = PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);CHKERRQ(ierr); 347 totalrows += indvc_ij; 348 } 349 } 350 ierr = PetscMalloc1(nfrom+1,&offsets);CHKERRQ(ierr); 351 offsets[0] = 0; 352 for (i=0; i<nfrom; i++){ 353 offsets[i+1] = offsets[i] + sbsizes[2*i]; 354 sbsizes[2*i+1] = offsets[i]; 355 } 356 ierr = PetscFree(offsets);CHKERRQ(ierr); 357 if (sbrowsizes) *sbrowsizes = sbsizes; 358 if (sbrows) *sbrows = sbdata; 359 ierr = PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);CHKERRQ(ierr); 360 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 361 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[]) 366 { 367 const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices; 368 PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp; 369 PetscInt lsize,lsize_tmp; 370 PetscMPIInt rank,owner; 371 Mat amat,bmat; 372 PetscBool done; 373 PetscLayout cmap,rmap; 374 MPI_Comm comm; 375 PetscErrorCode ierr; 376 377 PetscFunctionBegin; 378 ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 379 ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 380 ierr = MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);CHKERRQ(ierr); 381 ierr = MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 382 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 383 ierr = 
MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 384 if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"can not get row IJ \n"); 385 /* is it a safe way to compute number of nonzero values ? */ 386 tnz = ai[an]+bi[bn]; 387 ierr = MatGetLayouts(mat,&rmap,&cmap);CHKERRQ(ierr); 388 ierr = PetscLayoutGetRange(rmap,&rstart,NULL);CHKERRQ(ierr); 389 ierr = PetscLayoutGetRange(cmap,&cstart,NULL);CHKERRQ(ierr); 390 /* it is a better way to estimate memory than the old implementation 391 * where global size of matrix is used 392 * */ 393 ierr = PetscMalloc1(tnz,&indices_temp);CHKERRQ(ierr); 394 for (i=0; i<nidx; i++) { 395 MPI_Comm iscomm; 396 397 ierr = ISGetLocalSize(is[i],&lsize);CHKERRQ(ierr); 398 ierr = ISGetIndices(is[i],&indices);CHKERRQ(ierr); 399 lsize_tmp = 0; 400 for (j=0; j<lsize; j++) { 401 owner = -1; 402 row = indices[j]; 403 ierr = PetscLayoutFindOwner(rmap,row,&owner);CHKERRQ(ierr); 404 if (owner != rank) continue; 405 /* local number */ 406 row -= rstart; 407 start = ai[row]; 408 end = ai[row+1]; 409 for (k=start; k<end; k++) { /* Amat */ 410 col = aj[k] + cstart; 411 indices_temp[lsize_tmp++] = col; 412 } 413 start = bi[row]; 414 end = bi[row+1]; 415 for (k=start; k<end; k++) { /* Bmat */ 416 col = gcols[bj[k]]; 417 indices_temp[lsize_tmp++] = col; 418 } 419 } 420 ierr = ISRestoreIndices(is[i],&indices);CHKERRQ(ierr); 421 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);CHKERRQ(ierr); 422 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 423 ierr = PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);CHKERRQ(ierr); 424 ierr = ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);CHKERRQ(ierr); 425 ierr = PetscCommDestroy(&iscomm);CHKERRQ(ierr); 426 } 427 ierr = PetscFree(indices_temp);CHKERRQ(ierr); 428 ierr = MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);CHKERRQ(ierr); 429 ierr = MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);CHKERRQ(ierr); 430 
PetscFunctionReturn(0); 431 } 432 433 434 /* 435 Sample message format: 436 If a processor A wants processor B to process some elements corresponding 437 to index sets is[1],is[5] 438 mesg [0] = 2 (no of index sets in the mesg) 439 ----------- 440 mesg [1] = 1 => is[1] 441 mesg [2] = sizeof(is[1]); 442 ----------- 443 mesg [3] = 5 => is[5] 444 mesg [4] = sizeof(is[5]); 445 ----------- 446 mesg [5] 447 mesg [n] datas[1] 448 ----------- 449 mesg[n+1] 450 mesg[m] data(is[5]) 451 ----------- 452 453 Notes: 454 nrqs - no of requests sent (or to be sent out) 455 nrqr - no of requests received (which have to be or which have been processed) 456 */ 457 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 458 { 459 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 460 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 461 const PetscInt **idx,*idx_i; 462 PetscInt *n,**data,len; 463 #if defined(PETSC_USE_CTABLE) 464 PetscTable *table_data,table_data_i; 465 PetscInt *tdata,tcount,tcount_max; 466 #else 467 PetscInt *data_i,*d_p; 468 #endif 469 PetscErrorCode ierr; 470 PetscMPIInt size,rank,tag1,tag2,proc = 0; 471 PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr; 472 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 473 PetscBT *table; 474 MPI_Comm comm; 475 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 476 MPI_Status *s_status,*recv_status; 477 MPI_Comm *iscomms; 478 char *t_p; 479 480 PetscFunctionBegin; 481 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 482 size = c->size; 483 rank = c->rank; 484 M = C->rmap->N; 485 486 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 487 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 488 489 ierr = PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr); 490 491 for (i=0; i<imax; i++) { 492 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 493 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 494 } 495 496 /* evaluate communication - mesg 
to who,length of mesg, and buffer space 497 required. Based on this, buffers are allocated, and data copied into them */ 498 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 499 for (i=0; i<imax; i++) { 500 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/ 501 idx_i = idx[i]; 502 len = n[i]; 503 for (j=0; j<len; j++) { 504 row = idx_i[j]; 505 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 506 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 507 w4[proc]++; 508 } 509 for (j=0; j<size; j++) { 510 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 511 } 512 } 513 514 nrqs = 0; /* no of outgoing messages */ 515 msz = 0; /* total mesg length (for all proc */ 516 w1[rank] = 0; /* no mesg sent to intself */ 517 w3[rank] = 0; 518 for (i=0; i<size; i++) { 519 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 520 } 521 /* pa - is list of processors to communicate with */ 522 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); 523 for (i=0,j=0; i<size; i++) { 524 if (w1[i]) {pa[j] = i; j++;} 525 } 526 527 /* Each message would have a header = 1 + 2*(no of IS) + data */ 528 for (i=0; i<nrqs; i++) { 529 j = pa[i]; 530 w1[j] += w2[j] + 2*w3[j]; 531 msz += w1[j]; 532 } 533 534 /* Determine the number of messages to expect, their lengths, from from-ids */ 535 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 536 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 537 538 /* Now post the Irecvs corresponding to these messages */ 539 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 540 541 /* Allocate Memory for outgoing messages */ 542 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 543 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr); 544 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 545 546 { 547 PetscInt *iptr = tmp,ict = 0; 548 for 
(i=0; i<nrqs; i++) { 549 j = pa[i]; 550 iptr += ict; 551 outdat[j] = iptr; 552 ict = w1[j]; 553 } 554 } 555 556 /* Form the outgoing messages */ 557 /* plug in the headers */ 558 for (i=0; i<nrqs; i++) { 559 j = pa[i]; 560 outdat[j][0] = 0; 561 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr); 562 ptr[j] = outdat[j] + 2*w3[j] + 1; 563 } 564 565 /* Memory for doing local proc's work */ 566 { 567 PetscInt M_BPB_imax = 0; 568 #if defined(PETSC_USE_CTABLE) 569 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 570 ierr = PetscMalloc1(imax,&table_data);CHKERRQ(ierr); 571 for (i=0; i<imax; i++) { 572 ierr = PetscTableCreate(n[i]+1,M+1,&table_data[i]);CHKERRQ(ierr); 573 } 574 ierr = PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);CHKERRQ(ierr); 575 for (i=0; i<imax; i++) { 576 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 577 } 578 #else 579 PetscInt Mimax = 0; 580 ierr = PetscIntMultError(M,imax, &Mimax);CHKERRQ(ierr); 581 ierr = PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);CHKERRQ(ierr); 582 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);CHKERRQ(ierr); 583 for (i=0; i<imax; i++) { 584 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 585 data[i] = d_p + M*i; 586 } 587 #endif 588 } 589 590 /* Parse the IS and update local tables and the outgoing buf with the data */ 591 { 592 PetscInt n_i,isz_i,*outdat_j,ctr_j; 593 PetscBT table_i; 594 595 for (i=0; i<imax; i++) { 596 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 597 n_i = n[i]; 598 table_i = table[i]; 599 idx_i = idx[i]; 600 #if defined(PETSC_USE_CTABLE) 601 table_data_i = table_data[i]; 602 #else 603 data_i = data[i]; 604 #endif 605 isz_i = isz[i]; 606 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 607 row = idx_i[j]; 608 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 609 if (proc != rank) { /* copy to the outgoing buffer */ 610 ctr[proc]++; 611 *ptr[proc] = row; 612 ptr[proc]++; 
613 } else if (!PetscBTLookupSet(table_i,row)) { 614 #if defined(PETSC_USE_CTABLE) 615 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 616 #else 617 data_i[isz_i] = row; /* Update the local table */ 618 #endif 619 isz_i++; 620 } 621 } 622 /* Update the headers for the current IS */ 623 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 624 if ((ctr_j = ctr[j])) { 625 outdat_j = outdat[j]; 626 k = ++outdat_j[0]; 627 outdat_j[2*k] = ctr_j; 628 outdat_j[2*k-1] = i; 629 } 630 } 631 isz[i] = isz_i; 632 } 633 } 634 635 /* Now post the sends */ 636 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 637 for (i=0; i<nrqs; ++i) { 638 j = pa[i]; 639 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 640 } 641 642 /* No longer need the original indices */ 643 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr); 644 for (i=0; i<imax; ++i) { 645 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 646 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr); 647 } 648 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr); 649 650 for (i=0; i<imax; ++i) { 651 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 652 } 653 654 /* Do Local work */ 655 #if defined(PETSC_USE_CTABLE) 656 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);CHKERRQ(ierr); 657 #else 658 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);CHKERRQ(ierr); 659 #endif 660 661 /* Receive messages */ 662 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr); 663 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRMPI(ierr);} 664 665 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr); 666 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRMPI(ierr);} 667 668 /* Phase 1 sends are complete - deallocate buffers */ 669 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 670 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 671 672 ierr = 
PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr); 673 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr); 674 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 675 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 676 ierr = PetscFree(rbuf);CHKERRQ(ierr); 677 678 679 /* Send the data back */ 680 /* Do a global reduction to know the buffer space req for incoming messages */ 681 { 682 PetscMPIInt *rw1; 683 684 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 685 686 for (i=0; i<nrqr; ++i) { 687 proc = recv_status[i].MPI_SOURCE; 688 689 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 690 rw1[proc] = isz1[i]; 691 } 692 ierr = PetscFree(onodes1);CHKERRQ(ierr); 693 ierr = PetscFree(olengths1);CHKERRQ(ierr); 694 695 /* Determine the number of messages to expect, their lengths, from from-ids */ 696 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 697 ierr = PetscFree(rw1);CHKERRQ(ierr); 698 } 699 /* Now post the Irecvs corresponding to these messages */ 700 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 701 702 /* Now post the sends */ 703 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 704 for (i=0; i<nrqr; ++i) { 705 j = recv_status[i].MPI_SOURCE; 706 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRMPI(ierr); 707 } 708 709 /* receive work done on other processors */ 710 { 711 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax; 712 PetscMPIInt idex; 713 PetscBT table_i; 714 MPI_Status *status2; 715 716 ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr); 717 for (i=0; i<nrqs; ++i) { 718 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRMPI(ierr); 719 /* Process the message */ 720 rbuf2_i = rbuf2[idex]; 721 ct1 = 2*rbuf2_i[0]+1; 722 jmax = rbuf2[idex][0]; 723 for (j=1; j<=jmax; j++) { 724 max = rbuf2_i[2*j]; 725 is_no = rbuf2_i[2*j-1]; 726 isz_i = isz[is_no]; 727 table_i = table[is_no]; 728 #if 
defined(PETSC_USE_CTABLE) 729 table_data_i = table_data[is_no]; 730 #else 731 data_i = data[is_no]; 732 #endif 733 for (k=0; k<max; k++,ct1++) { 734 row = rbuf2_i[ct1]; 735 if (!PetscBTLookupSet(table_i,row)) { 736 #if defined(PETSC_USE_CTABLE) 737 ierr = PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 738 #else 739 data_i[isz_i] = row; 740 #endif 741 isz_i++; 742 } 743 } 744 isz[is_no] = isz_i; 745 } 746 } 747 748 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRMPI(ierr);} 749 ierr = PetscFree(status2);CHKERRQ(ierr); 750 } 751 752 #if defined(PETSC_USE_CTABLE) 753 tcount_max = 0; 754 for (i=0; i<imax; ++i) { 755 table_data_i = table_data[i]; 756 ierr = PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 757 if (tcount_max < tcount) tcount_max = tcount; 758 } 759 ierr = PetscMalloc1(tcount_max+1,&tdata);CHKERRQ(ierr); 760 #endif 761 762 for (i=0; i<imax; ++i) { 763 #if defined(PETSC_USE_CTABLE) 764 PetscTablePosition tpos; 765 table_data_i = table_data[i]; 766 767 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 768 while (tpos) { 769 ierr = PetscTableGetNext(table_data_i,&tpos,&k,&j);CHKERRQ(ierr); 770 tdata[--j] = --k; 771 } 772 ierr = ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 773 #else 774 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 775 #endif 776 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr); 777 } 778 779 ierr = PetscFree(iscomms);CHKERRQ(ierr); 780 ierr = PetscFree(onodes2);CHKERRQ(ierr); 781 ierr = PetscFree(olengths2);CHKERRQ(ierr); 782 783 ierr = PetscFree(pa);CHKERRQ(ierr); 784 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 785 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 786 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 787 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 788 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 789 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 790 ierr = PetscFree(s_status);CHKERRQ(ierr); 791 ierr = 
PetscFree(recv_status);CHKERRQ(ierr); 792 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 793 ierr = PetscFree(xdata);CHKERRQ(ierr); 794 ierr = PetscFree(isz1);CHKERRQ(ierr); 795 #if defined(PETSC_USE_CTABLE) 796 for (i=0; i<imax; i++) { 797 ierr = PetscTableDestroy((PetscTable*)&table_data[i]);CHKERRQ(ierr); 798 } 799 ierr = PetscFree(table_data);CHKERRQ(ierr); 800 ierr = PetscFree(tdata);CHKERRQ(ierr); 801 ierr = PetscFree4(table,data,isz,t_p);CHKERRQ(ierr); 802 #else 803 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 804 #endif 805 PetscFunctionReturn(0); 806 } 807 808 /* 809 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 810 the work on the local processor. 811 812 Inputs: 813 C - MAT_MPIAIJ; 814 imax - total no of index sets processed at a time; 815 table - an array of char - size = m bits. 816 817 Output: 818 isz - array containing the count of the solution elements corresponding 819 to each index set; 820 data or table_data - pointer to the solutions 821 */ 822 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data) 823 { 824 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 825 Mat A = c->A,B = c->B; 826 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 827 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 828 PetscInt *bi,*bj,*garray,i,j,k,row,isz_i; 829 PetscBT table_i; 830 #if defined(PETSC_USE_CTABLE) 831 PetscTable table_data_i; 832 PetscErrorCode ierr; 833 PetscTablePosition tpos; 834 PetscInt tcount,*tdata; 835 #else 836 PetscInt *data_i; 837 #endif 838 839 PetscFunctionBegin; 840 rstart = C->rmap->rstart; 841 cstart = C->cmap->rstart; 842 ai = a->i; 843 aj = a->j; 844 bi = b->i; 845 bj = b->j; 846 garray = c->garray; 847 848 for (i=0; i<imax; i++) { 849 #if defined(PETSC_USE_CTABLE) 850 /* copy existing entries of table_data_i into tdata[] */ 851 table_data_i = table_data[i]; 852 ierr = 
PetscTableGetCount(table_data_i,&tcount);CHKERRQ(ierr); 853 if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,0," tcount %d != isz[%d] %d",tcount,i,isz[i]); 854 855 ierr = PetscMalloc1(tcount,&tdata);CHKERRQ(ierr); 856 ierr = PetscTableGetHeadPosition(table_data_i,&tpos);CHKERRQ(ierr); 857 while (tpos) { 858 ierr = PetscTableGetNext(table_data_i,&tpos,&row,&j);CHKERRQ(ierr); 859 tdata[--j] = --row; 860 if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,0," j %d >= tcount %d",j,tcount); 861 } 862 #else 863 data_i = data[i]; 864 #endif 865 table_i = table[i]; 866 isz_i = isz[i]; 867 max = isz[i]; 868 869 for (j=0; j<max; j++) { 870 #if defined(PETSC_USE_CTABLE) 871 row = tdata[j] - rstart; 872 #else 873 row = data_i[j] - rstart; 874 #endif 875 start = ai[row]; 876 end = ai[row+1]; 877 for (k=start; k<end; k++) { /* Amat */ 878 val = aj[k] + cstart; 879 if (!PetscBTLookupSet(table_i,val)) { 880 #if defined(PETSC_USE_CTABLE) 881 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 882 #else 883 data_i[isz_i] = val; 884 #endif 885 isz_i++; 886 } 887 } 888 start = bi[row]; 889 end = bi[row+1]; 890 for (k=start; k<end; k++) { /* Bmat */ 891 val = garray[bj[k]]; 892 if (!PetscBTLookupSet(table_i,val)) { 893 #if defined(PETSC_USE_CTABLE) 894 ierr = PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);CHKERRQ(ierr); 895 #else 896 data_i[isz_i] = val; 897 #endif 898 isz_i++; 899 } 900 } 901 } 902 isz[i] = isz_i; 903 904 #if defined(PETSC_USE_CTABLE) 905 ierr = PetscFree(tdata);CHKERRQ(ierr); 906 #endif 907 } 908 PetscFunctionReturn(0); 909 } 910 911 /* 912 MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages, 913 and return the output 914 915 Input: 916 C - the matrix 917 nrqr - no of messages being processed. 
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz = total number of requested row indices across all messages;
     each message is laid out as [nIS, (id,len)*nIS, rows...] so rbuf_i[2*j]
     is the length of the j-th IS in message i */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* heuristic estimate for the reply-buffer size: requested rows times
     average nonzeros per local row; grown by 1.5x on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;  /* read cursor past the header of message i */
    ct2    =  ct1;         /* write cursor into xdata[i] */
    ct3   += ct1;          /* total words consumed from the shared buffer */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first pass: copy the requested rows themselves, deduplicated via xtable */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single shared buffer by 1.5x and rebase all previously
               assigned xdata[ctr] sub-pointers into the new allocation */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* second pass: for every row just stored, append the columns of that
         row (diagonal block A, then off-diagonal block B via garray) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next message starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
/*
    Every processor gets the entire matrix
*/
PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = NULL,*displs = NULL;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRMPI(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr =
PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    /* only the locally owned slice of lens is filled here; the allgatherv
       below completes it on every rank */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    /* Bin gets a 2-entry array with only the first slot used here;
       presumably the extra slot is for the destroy path -- TODO confirm */
    ierr = PetscCalloc1(2,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over; each local row is written
       as lower off-diagonal entries, then diagonal-block entries, then upper
       off-diagonal entries, so columns come out in ascending global order
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscArraycpy(b->ilen,lens,A->rmap->N);CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered, reuse the stored matrix;
       recvcounts/displs stay NULL and are re-allocated below if needed */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
    MatScalar         *sendbuf,*recvbuf;

    ierr = MatSeqAIJGetArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJGetArrayRead(a->B,&bda);CHKERRQ(ierr);
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ada;
    b_sendbuf = bda;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the column-index copy above,
       so values line up with the already-gathered b->j */
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRMPI(ierr);
#endif
    ierr = MatSeqAIJRestoreArrayRead(a->A,&ada);CHKERRQ(ierr);
    ierr = MatSeqAIJRestoreArrayRead(a->B,&bda);CHKERRQ(ierr);
  } /* endof (flag == MAT_GET_VALUES) */
  /* PetscFree2(NULL,NULL) is a no-op, so this is safe when values were not gathered */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            submat,A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
  PetscInt       *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
  PetscInt       cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
  const PetscInt *icol,*irow;
  PetscInt       nrow,ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
  PetscInt       nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
  PetscInt       **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
  PetscInt       *lens,rmax,ncols,*cols,Crow;
#if defined(PETSC_USE_CTABLE)
  PetscTable cmap,rmap;
  PetscInt   *cmap_loc,*rmap_loc;
#else
  PetscInt *cmap,*rmap;
#endif
  PetscInt    ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
  PetscInt    *cworkB,lwrite,*subcols,*row2proc;
  PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
  MPI_Request
*s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 1262 MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4; 1263 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2; 1264 MPI_Status *r_status3 = NULL,*r_status4,*s_status4; 1265 MPI_Comm comm; 1266 PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i; 1267 PetscMPIInt *onodes1,*olengths1,idex,end; 1268 Mat_SubSppt *smatis1; 1269 PetscBool isrowsorted,iscolsorted; 1270 1271 PetscFunctionBegin; 1272 PetscValidLogicalCollectiveInt(C,ismax,2); 1273 PetscValidLogicalCollectiveEnum(C,scall,5); 1274 if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1"); 1275 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1276 ierr = MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1277 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1278 size = c->size; 1279 rank = c->rank; 1280 1281 ierr = ISSorted(iscol[0],&iscolsorted);CHKERRQ(ierr); 1282 ierr = ISSorted(isrow[0],&isrowsorted);CHKERRQ(ierr); 1283 ierr = ISGetIndices(isrow[0],&irow);CHKERRQ(ierr); 1284 ierr = ISGetLocalSize(isrow[0],&nrow);CHKERRQ(ierr); 1285 if (allcolumns) { 1286 icol = NULL; 1287 ncol = C->cmap->N; 1288 } else { 1289 ierr = ISGetIndices(iscol[0],&icol);CHKERRQ(ierr); 1290 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1291 } 1292 1293 if (scall == MAT_INITIAL_MATRIX) { 1294 PetscInt *sbuf2_i,*cworkA,lwrite,ctmp; 1295 1296 /* Get some new tags to keep the communication clean */ 1297 tag1 = ((PetscObject)C)->tag; 1298 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 1299 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 1300 1301 /* evaluate communication - mesg to who, length of mesg, and buffer space 1302 required. 
Based on this, buffers are allocated, and data copied into them */ 1303 ierr = PetscCalloc2(size,&w1,size,&w2);CHKERRQ(ierr); 1304 ierr = PetscMalloc1(nrow,&row2proc);CHKERRQ(ierr); 1305 1306 /* w1[proc] = num of rows owned by proc -- to be requested */ 1307 proc = 0; 1308 nrqs = 0; /* num of outgoing messages */ 1309 for (j=0; j<nrow; j++) { 1310 row = irow[j]; 1311 if (!isrowsorted) proc = 0; 1312 while (row >= C->rmap->range[proc+1]) proc++; 1313 w1[proc]++; 1314 row2proc[j] = proc; /* map row index to proc */ 1315 1316 if (proc != rank && !w2[proc]) { 1317 w2[proc] = 1; nrqs++; 1318 } 1319 } 1320 w1[rank] = 0; /* rows owned by self will not be requested */ 1321 1322 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 1323 for (proc=0,j=0; proc<size; proc++) { 1324 if (w1[proc]) { pa[j++] = proc;} 1325 } 1326 1327 /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */ 1328 msz = 0; /* total mesg length (for all procs) */ 1329 for (i=0; i<nrqs; i++) { 1330 proc = pa[i]; 1331 w1[proc] += 3; 1332 msz += w1[proc]; 1333 } 1334 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 1335 1336 /* Determine nrqr, the number of messages to expect, their lengths, from from-ids */ 1337 /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */ 1338 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 1339 1340 /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent; 1341 Output: onodes1: recv node-ids; olengths1: corresponding recv message length */ 1342 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 1343 1344 /* Now post the Irecvs corresponding to these messages */ 1345 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 1346 1347 ierr = PetscFree(onodes1);CHKERRQ(ierr); 1348 ierr = PetscFree(olengths1);CHKERRQ(ierr); 1349 1350 /* Allocate Memory for outgoing messages 
*/ 1351 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 1352 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 1353 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 1354 1355 /* subf1[pa[0]] = tmp, subf1[pa[i]] = subf1[pa[i-1]] + w1[pa[i-1]] */ 1356 iptr = tmp; 1357 for (i=0; i<nrqs; i++) { 1358 proc = pa[i]; 1359 sbuf1[proc] = iptr; 1360 iptr += w1[proc]; 1361 } 1362 1363 /* Form the outgoing messages */ 1364 /* Initialize the header space */ 1365 for (i=0; i<nrqs; i++) { 1366 proc = pa[i]; 1367 ierr = PetscArrayzero(sbuf1[proc],3);CHKERRQ(ierr); 1368 ptr[proc] = sbuf1[proc] + 3; 1369 } 1370 1371 /* Parse the isrow and copy data into outbuf */ 1372 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 1373 for (j=0; j<nrow; j++) { /* parse the indices of each IS */ 1374 proc = row2proc[j]; 1375 if (proc != rank) { /* copy to the outgoing buf*/ 1376 *ptr[proc] = irow[j]; 1377 ctr[proc]++; ptr[proc]++; 1378 } 1379 } 1380 1381 /* Update the headers for the current IS */ 1382 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 1383 if ((ctr_j = ctr[j])) { 1384 sbuf1_j = sbuf1[j]; 1385 k = ++sbuf1_j[0]; 1386 sbuf1_j[2*k] = ctr_j; 1387 sbuf1_j[2*k-1] = 0; 1388 } 1389 } 1390 1391 /* Now post the sends */ 1392 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 1393 for (i=0; i<nrqs; ++i) { 1394 proc = pa[i]; 1395 ierr = MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);CHKERRMPI(ierr); 1396 } 1397 1398 /* Post Receives to capture the buffer size */ 1399 ierr = PetscMalloc4(nrqs+1,&r_status2,nrqr+1,&s_waits2,nrqs+1,&r_waits2,nrqr+1,&s_status2);CHKERRQ(ierr); 1400 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 1401 1402 rbuf2[0] = tmp + msz; 1403 for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; 1404 1405 for (i=0; i<nrqs; ++i) { 1406 proc = pa[i]; 1407 ierr = MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 1408 } 1409 1410 ierr = 
PetscFree2(w1,w2);CHKERRQ(ierr); 1411 1412 /* Send to other procs the buf size they should allocate */ 1413 /* Receive messages*/ 1414 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 1415 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 1416 1417 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 1418 for (i=0; i<nrqr; ++i) { 1419 req_size[i] = 0; 1420 rbuf1_i = rbuf1[i]; 1421 start = 2*rbuf1_i[0] + 1; 1422 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1423 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 1424 sbuf2_i = sbuf2[i]; 1425 for (j=start; j<end; j++) { 1426 k = rbuf1_i[j] - rstart; 1427 ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k]; 1428 sbuf2_i[j] = ncols; 1429 req_size[i] += ncols; 1430 } 1431 req_source1[i] = r_status1[i].MPI_SOURCE; 1432 1433 /* form the header */ 1434 sbuf2_i[0] = req_size[i]; 1435 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1436 1437 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 1438 } 1439 1440 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1441 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1442 1443 /* rbuf2 is received, Post recv column indices a->j */ 1444 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 1445 1446 ierr = PetscMalloc4(nrqs+1,&r_waits3,nrqr+1,&s_waits3,nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 1447 for (i=0; i<nrqs; ++i) { 1448 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 1449 req_source2[i] = r_status2[i].MPI_SOURCE; 1450 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 1451 } 1452 1453 /* Wait on sends1 and sends2 */ 1454 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1455 ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr); 1456 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1457 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1458 1459 ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr); 1460 ierr = 
PetscFree4(r_status2,s_waits2,r_waits2,s_status2);CHKERRQ(ierr); 1461 1462 /* Now allocate sending buffers for a->j, and send them off */ 1463 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1464 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1465 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1466 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1467 1468 for (i=0; i<nrqr; i++) { /* for each requested message */ 1469 rbuf1_i = rbuf1[i]; 1470 sbuf_aj_i = sbuf_aj[i]; 1471 ct1 = 2*rbuf1_i[0] + 1; 1472 ct2 = 0; 1473 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,0,"max1 %d != 1",max1); */ 1474 1475 kmax = rbuf1[i][2]; 1476 for (k=0; k<kmax; k++,ct1++) { /* for each row */ 1477 row = rbuf1_i[ct1] - rstart; 1478 nzA = ai[row+1] - ai[row]; 1479 nzB = bi[row+1] - bi[row]; 1480 ncols = nzA + nzB; 1481 cworkA = aj + ai[row]; cworkB = bj + bi[row]; 1482 1483 /* load the column indices for this row into cols*/ 1484 cols = sbuf_aj_i + ct2; 1485 1486 lwrite = 0; 1487 for (l=0; l<nzB; l++) { 1488 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1489 } 1490 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1491 for (l=0; l<nzB; l++) { 1492 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1493 } 1494 1495 ct2 += ncols; 1496 } 1497 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 1498 } 1499 1500 /* create column map (cmap): global col of C -> local col of submat */ 1501 #if defined(PETSC_USE_CTABLE) 1502 if (!allcolumns) { 1503 ierr = PetscTableCreate(ncol+1,C->cmap->N+1,&cmap);CHKERRQ(ierr); 1504 ierr = PetscCalloc1(C->cmap->n,&cmap_loc);CHKERRQ(ierr); 1505 for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */ 1506 if (icol[j] >= cstart && icol[j] <cend) { 1507 cmap_loc[icol[j] - cstart] = j+1; 1508 } else { /* use PetscTable for non-local col indices */ 1509 ierr = PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1510 } 
1511 } 1512 } else { 1513 cmap = NULL; 1514 cmap_loc = NULL; 1515 } 1516 ierr = PetscCalloc1(C->rmap->n,&rmap_loc);CHKERRQ(ierr); 1517 #else 1518 if (!allcolumns) { 1519 ierr = PetscCalloc1(C->cmap->N,&cmap);CHKERRQ(ierr); 1520 for (j=0; j<ncol; j++) cmap[icol[j]] = j+1; 1521 } else { 1522 cmap = NULL; 1523 } 1524 #endif 1525 1526 /* Create lens for MatSeqAIJSetPreallocation() */ 1527 ierr = PetscCalloc1(nrow,&lens);CHKERRQ(ierr); 1528 1529 /* Compute lens from local part of C */ 1530 for (j=0; j<nrow; j++) { 1531 row = irow[j]; 1532 proc = row2proc[j]; 1533 if (proc == rank) { 1534 /* diagonal part A = c->A */ 1535 ncols = ai[row-rstart+1] - ai[row-rstart]; 1536 cols = aj + ai[row-rstart]; 1537 if (!allcolumns) { 1538 for (k=0; k<ncols; k++) { 1539 #if defined(PETSC_USE_CTABLE) 1540 tcol = cmap_loc[cols[k]]; 1541 #else 1542 tcol = cmap[cols[k]+cstart]; 1543 #endif 1544 if (tcol) lens[j]++; 1545 } 1546 } else { /* allcolumns */ 1547 lens[j] = ncols; 1548 } 1549 1550 /* off-diagonal part B = c->B */ 1551 ncols = bi[row-rstart+1] - bi[row-rstart]; 1552 cols = bj + bi[row-rstart]; 1553 if (!allcolumns) { 1554 for (k=0; k<ncols; k++) { 1555 #if defined(PETSC_USE_CTABLE) 1556 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1557 #else 1558 tcol = cmap[bmap[cols[k]]]; 1559 #endif 1560 if (tcol) lens[j]++; 1561 } 1562 } else { /* allcolumns */ 1563 lens[j] += ncols; 1564 } 1565 } 1566 } 1567 1568 /* Create row map (rmap): global row of C -> local row of submat */ 1569 #if defined(PETSC_USE_CTABLE) 1570 ierr = PetscTableCreate(nrow+1,C->rmap->N+1,&rmap);CHKERRQ(ierr); 1571 for (j=0; j<nrow; j++) { 1572 row = irow[j]; 1573 proc = row2proc[j]; 1574 if (proc == rank) { /* a local row */ 1575 rmap_loc[row - rstart] = j; 1576 } else { 1577 ierr = PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1578 } 1579 } 1580 #else 1581 ierr = PetscCalloc1(C->rmap->N,&rmap);CHKERRQ(ierr); 1582 for (j=0; j<nrow; j++) { 1583 rmap[irow[j]] = j; 1584 } 1585 #endif 
1586 1587 /* Update lens from offproc data */ 1588 /* recv a->j is done */ 1589 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 1590 for (i=0; i<nrqs; i++) { 1591 proc = pa[i]; 1592 sbuf1_i = sbuf1[proc]; 1593 /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */ 1594 ct1 = 2 + 1; 1595 ct2 = 0; 1596 rbuf2_i = rbuf2[i]; /* received length of C->j */ 1597 rbuf3_i = rbuf3[i]; /* received C->j */ 1598 1599 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1600 max1 = sbuf1_i[2]; 1601 for (k=0; k<max1; k++,ct1++) { 1602 #if defined(PETSC_USE_CTABLE) 1603 ierr = PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1604 row--; 1605 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1606 #else 1607 row = rmap[sbuf1_i[ct1]]; /* the row index in submat */ 1608 #endif 1609 /* Now, store row index of submat in sbuf1_i[ct1] */ 1610 sbuf1_i[ct1] = row; 1611 1612 nnz = rbuf2_i[ct1]; 1613 if (!allcolumns) { 1614 for (l=0; l<nnz; l++,ct2++) { 1615 #if defined(PETSC_USE_CTABLE) 1616 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1617 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1618 } else { 1619 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1620 } 1621 #else 1622 tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */ 1623 #endif 1624 if (tcol) lens[row]++; 1625 } 1626 } else { /* allcolumns */ 1627 lens[row] += nnz; 1628 } 1629 } 1630 } 1631 ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr); 1632 ierr = PetscFree4(r_waits3,s_waits3,r_status3,s_status3);CHKERRQ(ierr); 1633 1634 /* Create the submatrices */ 1635 ierr = MatCreate(PETSC_COMM_SELF,&submat);CHKERRQ(ierr); 1636 ierr = MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1637 1638 ierr = ISGetBlockSize(isrow[0],&i);CHKERRQ(ierr); 1639 ierr = ISGetBlockSize(iscol[0],&j);CHKERRQ(ierr); 1640 ierr = MatSetBlockSizes(submat,i,j);CHKERRQ(ierr); 1641 
ierr = MatSetType(submat,((PetscObject)A)->type_name);CHKERRQ(ierr); 1642 ierr = MatSeqAIJSetPreallocation(submat,0,lens);CHKERRQ(ierr); 1643 1644 /* create struct Mat_SubSppt and attached it to submat */ 1645 ierr = PetscNew(&smatis1);CHKERRQ(ierr); 1646 subc = (Mat_SeqAIJ*)submat->data; 1647 subc->submatis1 = smatis1; 1648 1649 smatis1->id = 0; 1650 smatis1->nrqs = nrqs; 1651 smatis1->nrqr = nrqr; 1652 smatis1->rbuf1 = rbuf1; 1653 smatis1->rbuf2 = rbuf2; 1654 smatis1->rbuf3 = rbuf3; 1655 smatis1->sbuf2 = sbuf2; 1656 smatis1->req_source2 = req_source2; 1657 1658 smatis1->sbuf1 = sbuf1; 1659 smatis1->ptr = ptr; 1660 smatis1->tmp = tmp; 1661 smatis1->ctr = ctr; 1662 1663 smatis1->pa = pa; 1664 smatis1->req_size = req_size; 1665 smatis1->req_source1 = req_source1; 1666 1667 smatis1->allcolumns = allcolumns; 1668 smatis1->singleis = PETSC_TRUE; 1669 smatis1->row2proc = row2proc; 1670 smatis1->rmap = rmap; 1671 smatis1->cmap = cmap; 1672 #if defined(PETSC_USE_CTABLE) 1673 smatis1->rmap_loc = rmap_loc; 1674 smatis1->cmap_loc = cmap_loc; 1675 #endif 1676 1677 smatis1->destroy = submat->ops->destroy; 1678 submat->ops->destroy = MatDestroySubMatrix_SeqAIJ; 1679 submat->factortype = C->factortype; 1680 1681 /* compute rmax */ 1682 rmax = 0; 1683 for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]); 1684 1685 } else { /* scall == MAT_REUSE_MATRIX */ 1686 submat = submats[0]; 1687 if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 1688 1689 subc = (Mat_SeqAIJ*)submat->data; 1690 rmax = subc->rmax; 1691 smatis1 = subc->submatis1; 1692 nrqs = smatis1->nrqs; 1693 nrqr = smatis1->nrqr; 1694 rbuf1 = smatis1->rbuf1; 1695 rbuf2 = smatis1->rbuf2; 1696 rbuf3 = smatis1->rbuf3; 1697 req_source2 = smatis1->req_source2; 1698 1699 sbuf1 = smatis1->sbuf1; 1700 sbuf2 = smatis1->sbuf2; 1701 ptr = smatis1->ptr; 1702 tmp = smatis1->tmp; 1703 ctr = smatis1->ctr; 1704 1705 pa = smatis1->pa; 1706 req_size = smatis1->req_size; 1707 req_source1 = smatis1->req_source1; 1708 1709 allcolumns = smatis1->allcolumns; 1710 row2proc = smatis1->row2proc; 1711 rmap = smatis1->rmap; 1712 cmap = smatis1->cmap; 1713 #if defined(PETSC_USE_CTABLE) 1714 rmap_loc = smatis1->rmap_loc; 1715 cmap_loc = smatis1->cmap_loc; 1716 #endif 1717 } 1718 1719 /* Post recv matrix values */ 1720 ierr = PetscMalloc3(nrqs+1,&rbuf4, rmax,&subcols, rmax,&subvals);CHKERRQ(ierr); 1721 ierr = PetscMalloc4(nrqs+1,&r_waits4,nrqr+1,&s_waits4,nrqs+1,&r_status4,nrqr+1,&s_status4);CHKERRQ(ierr); 1722 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 1723 for (i=0; i<nrqs; ++i) { 1724 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 1725 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 1726 } 1727 1728 /* Allocate sending buffers for a->a, and send them off */ 1729 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1730 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1731 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1732 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1733 1734 for (i=0; i<nrqr; i++) { 1735 rbuf1_i = rbuf1[i]; 1736 sbuf_aa_i = sbuf_aa[i]; 1737 ct1 = 2*rbuf1_i[0]+1; 1738 ct2 = 0; 1739 /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */ 1740 1741 kmax = rbuf1_i[2]; 1742 for (k=0; k<kmax; k++,ct1++) { 1743 row = rbuf1_i[ct1] - rstart; 1744 nzA = ai[row+1] - ai[row]; 1745 nzB = bi[row+1] - 
bi[row]; 1746 ncols = nzA + nzB; 1747 cworkB = bj + bi[row]; 1748 vworkA = a_a + ai[row]; 1749 vworkB = b_a + bi[row]; 1750 1751 /* load the column values for this row into vals*/ 1752 vals = sbuf_aa_i + ct2; 1753 1754 lwrite = 0; 1755 for (l=0; l<nzB; l++) { 1756 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1757 } 1758 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1759 for (l=0; l<nzB; l++) { 1760 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1761 } 1762 1763 ct2 += ncols; 1764 } 1765 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 1766 } 1767 1768 /* Assemble submat */ 1769 /* First assemble the local rows */ 1770 for (j=0; j<nrow; j++) { 1771 row = irow[j]; 1772 proc = row2proc[j]; 1773 if (proc == rank) { 1774 Crow = row - rstart; /* local row index of C */ 1775 #if defined(PETSC_USE_CTABLE) 1776 row = rmap_loc[Crow]; /* row index of submat */ 1777 #else 1778 row = rmap[row]; 1779 #endif 1780 1781 if (allcolumns) { 1782 /* diagonal part A = c->A */ 1783 ncols = ai[Crow+1] - ai[Crow]; 1784 cols = aj + ai[Crow]; 1785 vals = a_a + ai[Crow]; 1786 i = 0; 1787 for (k=0; k<ncols; k++) { 1788 subcols[i] = cols[k] + cstart; 1789 subvals[i++] = vals[k]; 1790 } 1791 1792 /* off-diagonal part B = c->B */ 1793 ncols = bi[Crow+1] - bi[Crow]; 1794 cols = bj + bi[Crow]; 1795 vals = b_a + bi[Crow]; 1796 for (k=0; k<ncols; k++) { 1797 subcols[i] = bmap[cols[k]]; 1798 subvals[i++] = vals[k]; 1799 } 1800 1801 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1802 1803 } else { /* !allcolumns */ 1804 #if defined(PETSC_USE_CTABLE) 1805 /* diagonal part A = c->A */ 1806 ncols = ai[Crow+1] - ai[Crow]; 1807 cols = aj + ai[Crow]; 1808 vals = a_a + ai[Crow]; 1809 i = 0; 1810 for (k=0; k<ncols; k++) { 1811 tcol = cmap_loc[cols[k]]; 1812 if (tcol) { 1813 subcols[i] = --tcol; 1814 subvals[i++] = vals[k]; 1815 } 1816 } 1817 1818 /* off-diagonal part B = c->B */ 1819 
ncols = bi[Crow+1] - bi[Crow]; 1820 cols = bj + bi[Crow]; 1821 vals = b_a + bi[Crow]; 1822 for (k=0; k<ncols; k++) { 1823 ierr = PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);CHKERRQ(ierr); 1824 if (tcol) { 1825 subcols[i] = --tcol; 1826 subvals[i++] = vals[k]; 1827 } 1828 } 1829 #else 1830 /* diagonal part A = c->A */ 1831 ncols = ai[Crow+1] - ai[Crow]; 1832 cols = aj + ai[Crow]; 1833 vals = a_a + ai[Crow]; 1834 i = 0; 1835 for (k=0; k<ncols; k++) { 1836 tcol = cmap[cols[k]+cstart]; 1837 if (tcol) { 1838 subcols[i] = --tcol; 1839 subvals[i++] = vals[k]; 1840 } 1841 } 1842 1843 /* off-diagonal part B = c->B */ 1844 ncols = bi[Crow+1] - bi[Crow]; 1845 cols = bj + bi[Crow]; 1846 vals = b_a + bi[Crow]; 1847 for (k=0; k<ncols; k++) { 1848 tcol = cmap[bmap[cols[k]]]; 1849 if (tcol) { 1850 subcols[i] = --tcol; 1851 subvals[i++] = vals[k]; 1852 } 1853 } 1854 #endif 1855 ierr = MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1856 } 1857 } 1858 } 1859 1860 /* Now assemble the off-proc rows */ 1861 for (i=0; i<nrqs; i++) { /* for each requested message */ 1862 /* recv values from other processes */ 1863 ierr = MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);CHKERRQ(ierr); 1864 proc = pa[idex]; 1865 sbuf1_i = sbuf1[proc]; 1866 /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,0,"jmax %d != 1",jmax); */ 1867 ct1 = 2 + 1; 1868 ct2 = 0; /* count of received C->j */ 1869 ct3 = 0; /* count of received C->j that will be inserted into submat */ 1870 rbuf2_i = rbuf2[idex]; /* int** received length of C->j from other processes */ 1871 rbuf3_i = rbuf3[idex]; /* int** received C->j from other processes */ 1872 rbuf4_i = rbuf4[idex]; /* scalar** received C->a from other processes */ 1873 1874 /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */ 1875 max1 = sbuf1_i[2]; /* num of rows */ 1876 for (k=0; k<max1; k++,ct1++) { /* for each recved row */ 1877 row = sbuf1_i[ct1]; /* row index of submat */ 1878 if 
(!allcolumns) { 1879 idex = 0; 1880 if (scall == MAT_INITIAL_MATRIX || !iscolsorted) { 1881 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1882 for (l=0; l<nnz; l++,ct2++) { /* for each recved column */ 1883 #if defined(PETSC_USE_CTABLE) 1884 if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) { 1885 tcol = cmap_loc[rbuf3_i[ct2] - cstart]; 1886 } else { 1887 ierr = PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1888 } 1889 #else 1890 tcol = cmap[rbuf3_i[ct2]]; 1891 #endif 1892 if (tcol) { 1893 subcols[idex] = --tcol; /* may not be sorted */ 1894 subvals[idex++] = rbuf4_i[ct2]; 1895 1896 /* We receive an entire column of C, but a subset of it needs to be inserted into submat. 1897 For reuse, we replace received C->j with index that should be inserted to submat */ 1898 if (iscolsorted) rbuf3_i[ct3++] = ct2; 1899 } 1900 } 1901 ierr = MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);CHKERRQ(ierr); 1902 } else { /* scall == MAT_REUSE_MATRIX */ 1903 submat = submats[0]; 1904 subc = (Mat_SeqAIJ*)submat->data; 1905 1906 nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */ 1907 for (l=0; l<nnz; l++) { 1908 ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */ 1909 subvals[idex++] = rbuf4_i[ct2]; 1910 } 1911 1912 bj = subc->j + subc->i[row]; /* sorted column indices */ 1913 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);CHKERRQ(ierr); 1914 } 1915 } else { /* allcolumns */ 1916 nnz = rbuf2_i[ct1]; /* num of C entries in this row */ 1917 ierr = MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);CHKERRQ(ierr); 1918 ct2 += nnz; 1919 } 1920 } 1921 } 1922 1923 /* sending a->a are done */ 1924 ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr); 1925 ierr = PetscFree4(r_waits4,s_waits4,r_status4,s_status4);CHKERRQ(ierr); 1926 1927 ierr = MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1928 ierr = 
MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1929 submats[0] = submat; 1930 1931 /* Restore the indices */ 1932 ierr = ISRestoreIndices(isrow[0],&irow);CHKERRQ(ierr); 1933 if (!allcolumns) { 1934 ierr = ISRestoreIndices(iscol[0],&icol);CHKERRQ(ierr); 1935 } 1936 1937 /* Destroy allocated memory */ 1938 for (i=0; i<nrqs; ++i) { 1939 ierr = PetscFree3(rbuf4[i],subcols,subvals);CHKERRQ(ierr); 1940 } 1941 ierr = PetscFree3(rbuf4,subcols,subvals);CHKERRQ(ierr); 1942 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1943 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1944 1945 if (scall == MAT_INITIAL_MATRIX) { 1946 ierr = PetscFree(lens);CHKERRQ(ierr); 1947 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1948 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1949 } 1950 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 1951 ierr = MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 1952 PetscFunctionReturn(0); 1953 } 1954 1955 PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1956 { 1957 PetscErrorCode ierr; 1958 PetscInt ncol; 1959 PetscBool colflag,allcolumns=PETSC_FALSE; 1960 1961 PetscFunctionBegin; 1962 /* Allocate memory to hold all the submatrices */ 1963 if (scall == MAT_INITIAL_MATRIX) { 1964 ierr = PetscCalloc1(2,submat);CHKERRQ(ierr); 1965 } 1966 1967 /* Check for special case: each processor gets entire matrix columns */ 1968 ierr = ISIdentity(iscol[0],&colflag);CHKERRQ(ierr); 1969 ierr = ISGetLocalSize(iscol[0],&ncol);CHKERRQ(ierr); 1970 if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE; 1971 1972 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);CHKERRQ(ierr); 1973 PetscFunctionReturn(0); 1974 } 1975 1976 PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1977 { 1978 PetscErrorCode ierr; 1979 PetscInt 
nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2]; 1980 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE; 1981 Mat_SeqAIJ *subc; 1982 Mat_SubSppt *smat; 1983 1984 PetscFunctionBegin; 1985 /* Check for special case: each processor has a single IS */ 1986 if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPIU_Allreduce() */ 1987 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1988 C->submat_singleis = PETSC_FALSE; /* resume its default value in case C will be used for non-single IS */ 1989 PetscFunctionReturn(0); 1990 } 1991 1992 /* Collect global wantallmatrix and nstages */ 1993 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt); 1994 else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 1995 if (!nmax) nmax = 1; 1996 1997 if (scall == MAT_INITIAL_MATRIX) { 1998 /* Collect global wantallmatrix and nstages */ 1999 if (ismax == 1 && C->rmap->N == C->cmap->N) { 2000 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 2001 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 2002 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 2003 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 2004 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 2005 wantallmatrix = PETSC_TRUE; 2006 2007 ierr = PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 2008 } 2009 } 2010 2011 /* Determine the number of stages through which submatrices are done 2012 Each stage will extract nmax submatrices. 2013 nmax is determined by the matrix column dimension. 2014 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 2015 */ 2016 nstages = ismax/nmax + ((ismax % nmax) ? 
1 : 0); /* local nstages */ 2017 2018 in[0] = -1*(PetscInt)wantallmatrix; 2019 in[1] = nstages; 2020 ierr = MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 2021 wantallmatrix = (PetscBool)(-out[0]); 2022 nstages = out[1]; /* Make sure every processor loops through the global nstages */ 2023 2024 } else { /* MAT_REUSE_MATRIX */ 2025 if (ismax) { 2026 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2027 smat = subc->submatis1; 2028 } else { /* (*submat)[0] is a dummy matrix */ 2029 smat = (Mat_SubSppt*)(*submat)[0]->data; 2030 } 2031 if (!smat) { 2032 /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */ 2033 wantallmatrix = PETSC_TRUE; 2034 } else if (smat->singleis) { 2035 ierr = MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 2036 PetscFunctionReturn(0); 2037 } else { 2038 nstages = smat->nstages; 2039 } 2040 } 2041 2042 if (wantallmatrix) { 2043 ierr = MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 2044 PetscFunctionReturn(0); 2045 } 2046 2047 /* Allocate memory to hold all the submatrices and dummy submatrices */ 2048 if (scall == MAT_INITIAL_MATRIX) { 2049 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr); 2050 } 2051 2052 for (i=0,pos=0; i<nstages; i++) { 2053 if (pos+nmax <= ismax) max_no = nmax; 2054 else if (pos >= ismax) max_no = 0; 2055 else max_no = ismax-pos; 2056 2057 ierr = MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 2058 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */ 2059 smat = (Mat_SubSppt*)(*submat)[pos]->data; pos++; 2060 smat->nstages = nstages; 2061 } 2062 pos += max_no; 2063 } 2064 2065 if (ismax && scall == MAT_INITIAL_MATRIX) { 2066 /* save nstages for reuse */ 2067 subc = (Mat_SeqAIJ*)(*submat)[0]->data; 2068 smat = subc->submatis1; 2069 smat->nstages = nstages; 2070 } 2071 PetscFunctionReturn(0); 2072 } 2073 2074 
/* -------------------------------------------------------------------------*/ 2075 PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 2076 { 2077 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 2078 Mat A = c->A; 2079 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc; 2080 const PetscInt **icol,**irow; 2081 PetscInt *nrow,*ncol,start; 2082 PetscErrorCode ierr; 2083 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr; 2084 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1; 2085 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol; 2086 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2; 2087 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 2088 #if defined(PETSC_USE_CTABLE) 2089 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 2090 #else 2091 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 2092 #endif 2093 const PetscInt *irow_i; 2094 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 2095 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 2096 MPI_Request *r_waits4,*s_waits3,*s_waits4; 2097 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 2098 MPI_Status *r_status3,*r_status4,*s_status4; 2099 MPI_Comm comm; 2100 PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i; 2101 PetscMPIInt *onodes1,*olengths1,end; 2102 PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 2103 Mat_SubSppt *smat_i; 2104 PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE; 2105 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen; 2106 2107 PetscFunctionBegin; 2108 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 2109 size = c->size; 2110 rank = c->rank; 2111 2112 ierr = PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);CHKERRQ(ierr); 2113 ierr = 
PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr); 2114 2115 for (i=0; i<ismax; i++) { 2116 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr); 2117 if (!issorted[i]) iscsorted = issorted[i]; 2118 2119 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr); 2120 2121 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 2122 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 2123 2124 /* Check for special case: allcolumn */ 2125 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 2126 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 2127 if (colflag && ncol[i] == C->cmap->N) { 2128 allcolumns[i] = PETSC_TRUE; 2129 icol[i] = NULL; 2130 } else { 2131 allcolumns[i] = PETSC_FALSE; 2132 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 2133 } 2134 } 2135 2136 if (scall == MAT_REUSE_MATRIX) { 2137 /* Assumes new rows are same length as the old rows */ 2138 for (i=0; i<ismax; i++) { 2139 if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i); 2140 subc = (Mat_SeqAIJ*)submats[i]->data; 2141 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong size"); 2142 2143 /* Initial matrix as if empty */ 2144 ierr = PetscArrayzero(subc->ilen,submats[i]->rmap->n);CHKERRQ(ierr); 2145 2146 smat_i = subc->submatis1; 2147 2148 nrqs = smat_i->nrqs; 2149 nrqr = smat_i->nrqr; 2150 rbuf1 = smat_i->rbuf1; 2151 rbuf2 = smat_i->rbuf2; 2152 rbuf3 = smat_i->rbuf3; 2153 req_source2 = smat_i->req_source2; 2154 2155 sbuf1 = smat_i->sbuf1; 2156 sbuf2 = smat_i->sbuf2; 2157 ptr = smat_i->ptr; 2158 tmp = smat_i->tmp; 2159 ctr = smat_i->ctr; 2160 2161 pa = smat_i->pa; 2162 req_size = smat_i->req_size; 2163 req_source1 = smat_i->req_source1; 2164 2165 allcolumns[i] = smat_i->allcolumns; 2166 row2proc[i] = smat_i->row2proc; 2167 rmap[i] = smat_i->rmap; 2168 cmap[i] = smat_i->cmap; 2169 } 2170 2171 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */ 2172 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse"); 2173 smat_i = (Mat_SubSppt*)submats[0]->data; 2174 2175 nrqs = smat_i->nrqs; 2176 nrqr = smat_i->nrqr; 2177 rbuf1 = smat_i->rbuf1; 2178 rbuf2 = smat_i->rbuf2; 2179 rbuf3 = smat_i->rbuf3; 2180 req_source2 = smat_i->req_source2; 2181 2182 sbuf1 = smat_i->sbuf1; 2183 sbuf2 = smat_i->sbuf2; 2184 ptr = smat_i->ptr; 2185 tmp = smat_i->tmp; 2186 ctr = smat_i->ctr; 2187 2188 pa = smat_i->pa; 2189 req_size = smat_i->req_size; 2190 req_source1 = smat_i->req_source1; 2191 2192 allcolumns[0] = PETSC_FALSE; 2193 } 2194 } else { /* scall == MAT_INITIAL_MATRIX */ 2195 /* Get some new tags to keep the communication clean */ 2196 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 2197 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 2198 2199 /* evaluate communication - mesg to who, length of mesg, and buffer space 2200 required. 
Based on this, buffers are allocated, and data copied into them*/ 2201 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */ 2202 2203 for (i=0; i<ismax; i++) { 2204 jmax = nrow[i]; 2205 irow_i = irow[i]; 2206 2207 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr); 2208 row2proc[i] = row2proc_i; 2209 2210 if (issorted[i]) proc = 0; 2211 for (j=0; j<jmax; j++) { 2212 if (!issorted[i]) proc = 0; 2213 row = irow_i[j]; 2214 while (row >= C->rmap->range[proc+1]) proc++; 2215 w4[proc]++; 2216 row2proc_i[j] = proc; /* map row index to proc */ 2217 } 2218 for (j=0; j<size; j++) { 2219 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;} 2220 } 2221 } 2222 2223 nrqs = 0; /* no of outgoing messages */ 2224 msz = 0; /* total mesg length (for all procs) */ 2225 w1[rank] = 0; /* no mesg sent to self */ 2226 w3[rank] = 0; 2227 for (i=0; i<size; i++) { 2228 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 2229 } 2230 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 2231 for (i=0,j=0; i<size; i++) { 2232 if (w1[i]) { pa[j] = i; j++; } 2233 } 2234 2235 /* Each message would have a header = 1 + 2*(no of IS) + data */ 2236 for (i=0; i<nrqs; i++) { 2237 j = pa[i]; 2238 w1[j] += w2[j] + 2* w3[j]; 2239 msz += w1[j]; 2240 } 2241 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 2242 2243 /* Determine the number of messages to expect, their lengths, from from-ids */ 2244 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 2245 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 2246 2247 /* Now post the Irecvs corresponding to these messages */ 2248 tag0 = ((PetscObject)C)->tag; 2249 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 2250 2251 ierr = PetscFree(onodes1);CHKERRQ(ierr); 2252 ierr = PetscFree(olengths1);CHKERRQ(ierr); 2253 2254 /* 
Allocate Memory for outgoing messages */ 2255 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 2256 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr); 2257 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr); 2258 2259 { 2260 PetscInt *iptr = tmp; 2261 k = 0; 2262 for (i=0; i<nrqs; i++) { 2263 j = pa[i]; 2264 iptr += k; 2265 sbuf1[j] = iptr; 2266 k = w1[j]; 2267 } 2268 } 2269 2270 /* Form the outgoing messages. Initialize the header space */ 2271 for (i=0; i<nrqs; i++) { 2272 j = pa[i]; 2273 sbuf1[j][0] = 0; 2274 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr); 2275 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 2276 } 2277 2278 /* Parse the isrow and copy data into outbuf */ 2279 for (i=0; i<ismax; i++) { 2280 row2proc_i = row2proc[i]; 2281 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr); 2282 irow_i = irow[i]; 2283 jmax = nrow[i]; 2284 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 2285 proc = row2proc_i[j]; 2286 if (proc != rank) { /* copy to the outgoing buf*/ 2287 ctr[proc]++; 2288 *ptr[proc] = irow_i[j]; 2289 ptr[proc]++; 2290 } 2291 } 2292 /* Update the headers for the current IS */ 2293 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 2294 if ((ctr_j = ctr[j])) { 2295 sbuf1_j = sbuf1[j]; 2296 k = ++sbuf1_j[0]; 2297 sbuf1_j[2*k] = ctr_j; 2298 sbuf1_j[2*k-1] = i; 2299 } 2300 } 2301 } 2302 2303 /* Now post the sends */ 2304 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 2305 for (i=0; i<nrqs; ++i) { 2306 j = pa[i]; 2307 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRMPI(ierr); 2308 } 2309 2310 /* Post Receives to capture the buffer size */ 2311 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 2312 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr); 2313 rbuf2[0] = tmp + msz; 2314 for (i=1; i<nrqs; ++i) { 2315 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 2316 } 2317 for (i=0; i<nrqs; ++i) { 2318 j = pa[i]; 2319 ierr = 
MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRMPI(ierr); 2320 } 2321 2322 /* Send to other procs the buf size they should allocate */ 2323 /* Receive messages*/ 2324 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 2325 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 2326 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr); 2327 { 2328 PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart; 2329 PetscInt *sbuf2_i; 2330 2331 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRMPI(ierr); 2332 for (i=0; i<nrqr; ++i) { 2333 req_size[i] = 0; 2334 rbuf1_i = rbuf1[i]; 2335 start = 2*rbuf1_i[0] + 1; 2336 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 2337 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr); 2338 sbuf2_i = sbuf2[i]; 2339 for (j=start; j<end; j++) { 2340 id = rbuf1_i[j] - rstart; 2341 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 2342 sbuf2_i[j] = ncols; 2343 req_size[i] += ncols; 2344 } 2345 req_source1[i] = r_status1[i].MPI_SOURCE; 2346 /* form the header */ 2347 sbuf2_i[0] = req_size[i]; 2348 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 2349 2350 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRMPI(ierr); 2351 } 2352 } 2353 ierr = PetscFree(r_status1);CHKERRQ(ierr); 2354 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 2355 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 2356 2357 /* Receive messages*/ 2358 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 2359 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 2360 2361 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRMPI(ierr); 2362 for (i=0; i<nrqs; ++i) { 2363 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr); 2364 req_source2[i] = r_status2[i].MPI_SOURCE; 2365 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRMPI(ierr); 2366 } 2367 ierr = PetscFree(r_status2);CHKERRQ(ierr); 2368 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 2369 2370 /* Wait on 
sends1 and sends2 */ 2371 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 2372 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 2373 2374 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRMPI(ierr);} 2375 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRMPI(ierr);} 2376 ierr = PetscFree(s_status1);CHKERRQ(ierr); 2377 ierr = PetscFree(s_status2);CHKERRQ(ierr); 2378 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 2379 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 2380 2381 /* Now allocate sending buffers for a->j, and send them off */ 2382 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 2383 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2384 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 2385 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 2386 2387 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 2388 { 2389 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 2390 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2391 PetscInt cend = C->cmap->rend; 2392 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 2393 2394 for (i=0; i<nrqr; i++) { 2395 rbuf1_i = rbuf1[i]; 2396 sbuf_aj_i = sbuf_aj[i]; 2397 ct1 = 2*rbuf1_i[0] + 1; 2398 ct2 = 0; 2399 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2400 kmax = rbuf1[i][2*j]; 2401 for (k=0; k<kmax; k++,ct1++) { 2402 row = rbuf1_i[ct1] - rstart; 2403 nzA = a_i[row+1] - a_i[row]; 2404 nzB = b_i[row+1] - b_i[row]; 2405 ncols = nzA + nzB; 2406 cworkA = a_j + a_i[row]; 2407 cworkB = b_j + b_i[row]; 2408 2409 /* load the column indices for this row into cols */ 2410 cols = sbuf_aj_i + ct2; 2411 2412 lwrite = 0; 2413 for (l=0; l<nzB; l++) { 2414 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 2415 } 2416 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 2417 for (l=0; l<nzB; l++) { 2418 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 2419 } 2420 2421 ct2 += ncols; 2422 } 2423 } 2424 ierr = 
MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRMPI(ierr); 2425 } 2426 } 2427 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr); 2428 2429 /* create col map: global col of C -> local col of submatrices */ 2430 { 2431 const PetscInt *icol_i; 2432 #if defined(PETSC_USE_CTABLE) 2433 for (i=0; i<ismax; i++) { 2434 if (!allcolumns[i]) { 2435 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 2436 2437 jmax = ncol[i]; 2438 icol_i = icol[i]; 2439 cmap_i = cmap[i]; 2440 for (j=0; j<jmax; j++) { 2441 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2442 } 2443 } else cmap[i] = NULL; 2444 } 2445 #else 2446 for (i=0; i<ismax; i++) { 2447 if (!allcolumns[i]) { 2448 ierr = PetscCalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 2449 jmax = ncol[i]; 2450 icol_i = icol[i]; 2451 cmap_i = cmap[i]; 2452 for (j=0; j<jmax; j++) { 2453 cmap_i[icol_i[j]] = j+1; 2454 } 2455 } else cmap[i] = NULL; 2456 } 2457 #endif 2458 } 2459 2460 /* Create lens which is required for MatCreate... 
*/ 2461 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 2462 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 2463 2464 if (ismax) { 2465 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr); 2466 } 2467 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 2468 2469 /* Update lens from local data */ 2470 for (i=0; i<ismax; i++) { 2471 row2proc_i = row2proc[i]; 2472 jmax = nrow[i]; 2473 if (!allcolumns[i]) cmap_i = cmap[i]; 2474 irow_i = irow[i]; 2475 lens_i = lens[i]; 2476 for (j=0; j<jmax; j++) { 2477 row = irow_i[j]; 2478 proc = row2proc_i[j]; 2479 if (proc == rank) { 2480 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2481 if (!allcolumns[i]) { 2482 for (k=0; k<ncols; k++) { 2483 #if defined(PETSC_USE_CTABLE) 2484 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2485 #else 2486 tcol = cmap_i[cols[k]]; 2487 #endif 2488 if (tcol) lens_i[j]++; 2489 } 2490 } else { /* allcolumns */ 2491 lens_i[j] = ncols; 2492 } 2493 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);CHKERRQ(ierr); 2494 } 2495 } 2496 } 2497 2498 /* Create row map: global row of C -> local row of submatrices */ 2499 #if defined(PETSC_USE_CTABLE) 2500 for (i=0; i<ismax; i++) { 2501 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 2502 irow_i = irow[i]; 2503 jmax = nrow[i]; 2504 for (j=0; j<jmax; j++) { 2505 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 2506 } 2507 } 2508 #else 2509 for (i=0; i<ismax; i++) { 2510 ierr = PetscCalloc1(C->rmap->N,&rmap[i]);CHKERRQ(ierr); 2511 rmap_i = rmap[i]; 2512 irow_i = irow[i]; 2513 jmax = nrow[i]; 2514 for (j=0; j<jmax; j++) { 2515 rmap_i[irow_i[j]] = j; 2516 } 2517 } 2518 #endif 2519 2520 /* Update lens from offproc data */ 2521 { 2522 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 2523 2524 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr); 2525 for (tmp2=0; tmp2<nrqs; tmp2++) { 2526 sbuf1_i = sbuf1[pa[tmp2]]; 2527 jmax = sbuf1_i[0]; 2528 ct1 = 2*jmax+1; 2529 ct2 = 0; 2530 rbuf2_i = 
rbuf2[tmp2]; 2531 rbuf3_i = rbuf3[tmp2]; 2532 for (j=1; j<=jmax; j++) { 2533 is_no = sbuf1_i[2*j-1]; 2534 max1 = sbuf1_i[2*j]; 2535 lens_i = lens[is_no]; 2536 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2537 rmap_i = rmap[is_no]; 2538 for (k=0; k<max1; k++,ct1++) { 2539 #if defined(PETSC_USE_CTABLE) 2540 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 2541 row--; 2542 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 2543 #else 2544 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 2545 #endif 2546 max2 = rbuf2_i[ct1]; 2547 for (l=0; l<max2; l++,ct2++) { 2548 if (!allcolumns[is_no]) { 2549 #if defined(PETSC_USE_CTABLE) 2550 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2551 #else 2552 tcol = cmap_i[rbuf3_i[ct2]]; 2553 #endif 2554 if (tcol) lens_i[row]++; 2555 } else { /* allcolumns */ 2556 lens_i[row]++; /* lens_i[row] += max2 ? */ 2557 } 2558 } 2559 } 2560 } 2561 } 2562 } 2563 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 2564 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRMPI(ierr);} 2565 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr); 2566 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 2567 2568 /* Create the submatrices */ 2569 for (i=0; i<ismax; i++) { 2570 PetscInt rbs,cbs; 2571 2572 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 2573 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 2574 2575 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 2576 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2577 2578 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 2579 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 2580 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 2581 2582 /* create struct Mat_SubSppt and attached it to submat */ 2583 ierr = PetscNew(&smat_i);CHKERRQ(ierr); 2584 subc = (Mat_SeqAIJ*)submats[i]->data; 2585 subc->submatis1 = smat_i; 2586 2587 
smat_i->destroy = submats[i]->ops->destroy; 2588 submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ; 2589 submats[i]->factortype = C->factortype; 2590 2591 smat_i->id = i; 2592 smat_i->nrqs = nrqs; 2593 smat_i->nrqr = nrqr; 2594 smat_i->rbuf1 = rbuf1; 2595 smat_i->rbuf2 = rbuf2; 2596 smat_i->rbuf3 = rbuf3; 2597 smat_i->sbuf2 = sbuf2; 2598 smat_i->req_source2 = req_source2; 2599 2600 smat_i->sbuf1 = sbuf1; 2601 smat_i->ptr = ptr; 2602 smat_i->tmp = tmp; 2603 smat_i->ctr = ctr; 2604 2605 smat_i->pa = pa; 2606 smat_i->req_size = req_size; 2607 smat_i->req_source1 = req_source1; 2608 2609 smat_i->allcolumns = allcolumns[i]; 2610 smat_i->singleis = PETSC_FALSE; 2611 smat_i->row2proc = row2proc[i]; 2612 smat_i->rmap = rmap[i]; 2613 smat_i->cmap = cmap[i]; 2614 } 2615 2616 if (!ismax) { /* Create dummy submats[0] for reuse struct subc */ 2617 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr); 2618 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 2619 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr); 2620 2621 /* create struct Mat_SubSppt and attached it to submat */ 2622 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr); 2623 submats[0]->data = (void*)smat_i; 2624 2625 smat_i->destroy = submats[0]->ops->destroy; 2626 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy; 2627 submats[0]->factortype = C->factortype; 2628 2629 smat_i->id = 0; 2630 smat_i->nrqs = nrqs; 2631 smat_i->nrqr = nrqr; 2632 smat_i->rbuf1 = rbuf1; 2633 smat_i->rbuf2 = rbuf2; 2634 smat_i->rbuf3 = rbuf3; 2635 smat_i->sbuf2 = sbuf2; 2636 smat_i->req_source2 = req_source2; 2637 2638 smat_i->sbuf1 = sbuf1; 2639 smat_i->ptr = ptr; 2640 smat_i->tmp = tmp; 2641 smat_i->ctr = ctr; 2642 2643 smat_i->pa = pa; 2644 smat_i->req_size = req_size; 2645 smat_i->req_source1 = req_source1; 2646 2647 smat_i->allcolumns = PETSC_FALSE; 2648 smat_i->singleis = PETSC_FALSE; 2649 smat_i->row2proc = NULL; 2650 smat_i->rmap = NULL; 2651 smat_i->cmap = NULL; 2652 } 2653 2654 
if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 2655 ierr = PetscFree(lens);CHKERRQ(ierr); 2656 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 2657 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 2658 2659 } /* endof scall == MAT_INITIAL_MATRIX */ 2660 2661 /* Post recv matrix values */ 2662 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr); 2663 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 2664 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 2665 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 2666 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 2667 for (i=0; i<nrqs; ++i) { 2668 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf4[i]);CHKERRQ(ierr); 2669 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRMPI(ierr); 2670 } 2671 2672 /* Allocate sending buffers for a->a, and send them off */ 2673 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 2674 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 2675 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 2676 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 2677 2678 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 2679 { 2680 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 2681 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 2682 PetscInt cend = C->cmap->rend; 2683 PetscInt *b_j = b->j; 2684 PetscScalar *vworkA,*vworkB,*a_a,*b_a; 2685 2686 ierr = MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2687 ierr = MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2688 for (i=0; i<nrqr; i++) { 2689 rbuf1_i = rbuf1[i]; 2690 sbuf_aa_i = sbuf_aa[i]; 2691 ct1 = 2*rbuf1_i[0]+1; 2692 ct2 = 0; 2693 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 2694 kmax = rbuf1_i[2*j]; 2695 for (k=0; k<kmax; k++,ct1++) { 2696 row = rbuf1_i[ct1] - rstart; 2697 nzA = a_i[row+1] - a_i[row]; 2698 nzB = b_i[row+1] - b_i[row]; 2699 ncols = nzA + nzB; 2700 cworkB = b_j + b_i[row]; 2701 vworkA = a_a + 
a_i[row]; 2702 vworkB = b_a + b_i[row]; 2703 2704 /* load the column values for this row into vals*/ 2705 vals = sbuf_aa_i+ct2; 2706 2707 lwrite = 0; 2708 for (l=0; l<nzB; l++) { 2709 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 2710 } 2711 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 2712 for (l=0; l<nzB; l++) { 2713 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 2714 } 2715 2716 ct2 += ncols; 2717 } 2718 } 2719 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRMPI(ierr); 2720 } 2721 ierr = MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);CHKERRQ(ierr); 2722 ierr = MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);CHKERRQ(ierr); 2723 } 2724 2725 /* Assemble the matrices */ 2726 /* First assemble the local rows */ 2727 for (i=0; i<ismax; i++) { 2728 row2proc_i = row2proc[i]; 2729 subc = (Mat_SeqAIJ*)submats[i]->data; 2730 imat_ilen = subc->ilen; 2731 imat_j = subc->j; 2732 imat_i = subc->i; 2733 imat_a = subc->a; 2734 2735 if (!allcolumns[i]) cmap_i = cmap[i]; 2736 rmap_i = rmap[i]; 2737 irow_i = irow[i]; 2738 jmax = nrow[i]; 2739 for (j=0; j<jmax; j++) { 2740 row = irow_i[j]; 2741 proc = row2proc_i[j]; 2742 if (proc == rank) { 2743 old_row = row; 2744 #if defined(PETSC_USE_CTABLE) 2745 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2746 row--; 2747 #else 2748 row = rmap_i[row]; 2749 #endif 2750 ilen_row = imat_ilen[row]; 2751 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2752 mat_i = imat_i[row]; 2753 mat_a = imat_a + mat_i; 2754 mat_j = imat_j + mat_i; 2755 if (!allcolumns[i]) { 2756 for (k=0; k<ncols; k++) { 2757 #if defined(PETSC_USE_CTABLE) 2758 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 2759 #else 2760 tcol = cmap_i[cols[k]]; 2761 #endif 2762 if (tcol) { 2763 *mat_j++ = tcol - 1; 2764 *mat_a++ = vals[k]; 2765 ilen_row++; 2766 } 2767 } 2768 } else { /* allcolumns */ 2769 for (k=0; k<ncols; k++) { 2770 *mat_j++ = cols[k]; 
/* global col index! */ 2771 *mat_a++ = vals[k]; 2772 ilen_row++; 2773 } 2774 } 2775 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 2776 2777 imat_ilen[row] = ilen_row; 2778 } 2779 } 2780 } 2781 2782 /* Now assemble the off proc rows */ 2783 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr); 2784 for (tmp2=0; tmp2<nrqs; tmp2++) { 2785 sbuf1_i = sbuf1[pa[tmp2]]; 2786 jmax = sbuf1_i[0]; 2787 ct1 = 2*jmax + 1; 2788 ct2 = 0; 2789 rbuf2_i = rbuf2[tmp2]; 2790 rbuf3_i = rbuf3[tmp2]; 2791 rbuf4_i = rbuf4[tmp2]; 2792 for (j=1; j<=jmax; j++) { 2793 is_no = sbuf1_i[2*j-1]; 2794 rmap_i = rmap[is_no]; 2795 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 2796 subc = (Mat_SeqAIJ*)submats[is_no]->data; 2797 imat_ilen = subc->ilen; 2798 imat_j = subc->j; 2799 imat_i = subc->i; 2800 imat_a = subc->a; 2801 max1 = sbuf1_i[2*j]; 2802 for (k=0; k<max1; k++,ct1++) { 2803 row = sbuf1_i[ct1]; 2804 #if defined(PETSC_USE_CTABLE) 2805 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 2806 row--; 2807 #else 2808 row = rmap_i[row]; 2809 #endif 2810 ilen = imat_ilen[row]; 2811 mat_i = imat_i[row]; 2812 mat_a = imat_a + mat_i; 2813 mat_j = imat_j + mat_i; 2814 max2 = rbuf2_i[ct1]; 2815 if (!allcolumns[is_no]) { 2816 for (l=0; l<max2; l++,ct2++) { 2817 #if defined(PETSC_USE_CTABLE) 2818 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 2819 #else 2820 tcol = cmap_i[rbuf3_i[ct2]]; 2821 #endif 2822 if (tcol) { 2823 *mat_j++ = tcol - 1; 2824 *mat_a++ = rbuf4_i[ct2]; 2825 ilen++; 2826 } 2827 } 2828 } else { /* allcolumns */ 2829 for (l=0; l<max2; l++,ct2++) { 2830 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 2831 *mat_a++ = rbuf4_i[ct2]; 2832 ilen++; 2833 } 2834 } 2835 imat_ilen[row] = ilen; 2836 } 2837 } 2838 } 2839 2840 if (!iscsorted) { /* sort column indices of the rows */ 2841 for (i=0; i<ismax; i++) { 2842 subc = (Mat_SeqAIJ*)submats[i]->data; 2843 imat_j = subc->j; 2844 imat_i = subc->i; 2845 imat_a = subc->a; 2846 imat_ilen 
= subc->ilen; 2847 2848 if (allcolumns[i]) continue; 2849 jmax = nrow[i]; 2850 for (j=0; j<jmax; j++) { 2851 mat_i = imat_i[j]; 2852 mat_a = imat_a + mat_i; 2853 mat_j = imat_j + mat_i; 2854 ierr = PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);CHKERRQ(ierr); 2855 } 2856 } 2857 } 2858 2859 ierr = PetscFree(r_status4);CHKERRQ(ierr); 2860 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 2861 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRMPI(ierr);} 2862 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 2863 ierr = PetscFree(s_status4);CHKERRQ(ierr); 2864 2865 /* Restore the indices */ 2866 for (i=0; i<ismax; i++) { 2867 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 2868 if (!allcolumns[i]) { 2869 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 2870 } 2871 } 2872 2873 for (i=0; i<ismax; i++) { 2874 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2875 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 2876 } 2877 2878 /* Destroy allocated memory */ 2879 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 2880 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 2881 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr); 2882 2883 for (i=0; i<nrqs; ++i) { 2884 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 2885 } 2886 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 2887 2888 ierr = PetscFree4(row2proc,cmap,rmap,allcolumns);CHKERRQ(ierr); 2889 PetscFunctionReturn(0); 2890 } 2891 2892 /* 2893 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 2894 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 2895 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 2896 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 
2897 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 2898 state, and needs to be "assembled" later by compressing B's column space. 2899 2900 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 2901 Following this call, C->A & C->B have been created, even if empty. 2902 */ 2903 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 2904 { 2905 /* If making this function public, change the error returned in this function away from _PLIB. */ 2906 PetscErrorCode ierr; 2907 Mat_MPIAIJ *aij; 2908 Mat_SeqAIJ *Baij; 2909 PetscBool seqaij,Bdisassembled; 2910 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 2911 PetscScalar v; 2912 const PetscInt *rowindices,*colindices; 2913 2914 PetscFunctionBegin; 2915 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 2916 if (A) { 2917 ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2918 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 2919 if (rowemb) { 2920 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2921 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 2922 } else { 2923 if (C->rmap->n != A->rmap->n) { 2924 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2925 } 2926 } 2927 if (dcolemb) { 2928 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 2929 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 2930 } else { 2931 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2932 } 2933 } 2934 if (B) { 2935 ierr = 
PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 2936 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type"); 2937 if (rowemb) { 2938 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 2939 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 2940 } else { 2941 if (C->rmap->n != B->rmap->n) { 2942 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 2943 } 2944 } 2945 if (ocolemb) { 2946 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 2947 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 2948 } else { 2949 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 2950 } 2951 } 2952 2953 aij = (Mat_MPIAIJ*)C->data; 2954 if (!aij->A) { 2955 /* Mimic parts of MatMPIAIJSetPreallocation() */ 2956 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 2957 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 2958 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 2959 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 2960 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 2961 } 2962 if (A) { 2963 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 2964 } else { 2965 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 2966 } 2967 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 2968 /* 2969 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 2970 need to "disassemble" B -- convert it to using C's global indices. 2971 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 
2972 2973 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 2974 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 2975 2976 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 2977 At least avoid calling MatSetValues() and the implied searches? 2978 */ 2979 2980 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 2981 #if defined(PETSC_USE_CTABLE) 2982 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 2983 #else 2984 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 2985 /* A bit of a HACK: ideally we should deal with case aij->B all in one code block below. */ 2986 if (aij->B) { 2987 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 2988 } 2989 #endif 2990 ngcol = 0; 2991 if (aij->lvec) { 2992 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 2993 } 2994 if (aij->garray) { 2995 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 2996 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 2997 } 2998 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 2999 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 3000 } 3001 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 3002 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 3003 } 3004 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 3005 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 3006 } 3007 } 3008 Bdisassembled = PETSC_FALSE; 3009 if (!aij->B) { 3010 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 3011 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 3012 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 3013 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 3014 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 3015 Bdisassembled = PETSC_TRUE; 3016 } 3017 if (B) { 3018 Baij = (Mat_SeqAIJ*)B->data; 3019 if (pattern == DIFFERENT_NONZERO_PATTERN) { 3020 
ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 3021 for (i=0; i<B->rmap->n; i++) { 3022 nz[i] = Baij->i[i+1] - Baij->i[i]; 3023 } 3024 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 3025 ierr = PetscFree(nz);CHKERRQ(ierr); 3026 } 3027 3028 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 3029 shift = rend-rstart; 3030 count = 0; 3031 rowindices = NULL; 3032 colindices = NULL; 3033 if (rowemb) { 3034 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 3035 } 3036 if (ocolemb) { 3037 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 3038 } 3039 for (i=0; i<B->rmap->n; i++) { 3040 PetscInt row; 3041 row = i; 3042 if (rowindices) row = rowindices[i]; 3043 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 3044 col = Baij->j[count]; 3045 if (colindices) col = colindices[col]; 3046 if (Bdisassembled && col>=rstart) col += shift; 3047 v = Baij->a[count]; 3048 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 3049 ++count; 3050 } 3051 } 3052 /* No assembly for aij->B is necessary. */ 3053 /* FIXME: set aij->B's nonzerostate correctly. */ 3054 } else { 3055 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 3056 } 3057 C->preallocated = PETSC_TRUE; 3058 C->was_assembled = PETSC_FALSE; 3059 C->assembled = PETSC_FALSE; 3060 /* 3061 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 3062 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 3063 */ 3064 PetscFunctionReturn(0); 3065 } 3066 3067 /* 3068 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 
3069 */ 3070 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 3071 { 3072 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data; 3073 3074 PetscFunctionBegin; 3075 PetscValidPointer(A,2); 3076 PetscValidPointer(B,3); 3077 /* FIXME: make sure C is assembled */ 3078 *A = aij->A; 3079 *B = aij->B; 3080 /* Note that we don't incref *A and *B, so be careful! */ 3081 PetscFunctionReturn(0); 3082 } 3083 3084 /* 3085 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 3086 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 3087 */ 3088 PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 3089 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**), 3090 PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*), 3091 PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat), 3092 PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat)) 3093 { 3094 PetscErrorCode ierr; 3095 PetscMPIInt size,flag; 3096 PetscInt i,ii,cismax,ispar; 3097 Mat *A,*B; 3098 IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p; 3099 3100 PetscFunctionBegin; 3101 if (!ismax) PetscFunctionReturn(0); 3102 3103 for (i = 0, cismax = 0; i < ismax; ++i) { 3104 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRMPI(ierr); 3105 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 3106 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRMPI(ierr); 3107 if (size > 1) ++cismax; 3108 } 3109 3110 /* 3111 If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction. 3112 ispar counts the number of parallel ISs across C's comm. 
3113 */ 3114 ierr = MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 3115 if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */ 3116 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 3117 PetscFunctionReturn(0); 3118 } 3119 3120 /* if (ispar) */ 3121 /* 3122 Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only. 3123 These are used to extract the off-diag portion of the resulting parallel matrix. 3124 The row IS for the off-diag portion is the same as for the diag portion, 3125 so we merely alias (without increfing) the row IS, while skipping those that are sequential. 3126 */ 3127 ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr); 3128 ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr); 3129 for (i = 0, ii = 0; i < ismax; ++i) { 3130 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3131 if (size > 1) { 3132 /* 3133 TODO: This is the part that's ***NOT SCALABLE***. 3134 To fix this we need to extract just the indices of C's nonzero columns 3135 that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal 3136 part of iscol[i] -- without actually computing ciscol[ii]. This also has 3137 to be done without serializing on the IS list, so, most likely, it is best 3138 done by rewriting MatCreateSubMatrices_MPIAIJ() directly. 3139 */ 3140 ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr); 3141 /* Now we have to 3142 (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices 3143 were sorted on each rank, concatenated they might no longer be sorted; 3144 (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the 3145 indices in the nondecreasing order to the original index positions. 3146 If ciscol[ii] is strictly increasing, the permutation IS is NULL. 
3147 */ 3148 ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr); 3149 ierr = ISSort(ciscol[ii]);CHKERRQ(ierr); 3150 ++ii; 3151 } 3152 } 3153 ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr); 3154 for (i = 0, ii = 0; i < ismax; ++i) { 3155 PetscInt j,issize; 3156 const PetscInt *indices; 3157 3158 /* 3159 Permute the indices into a nondecreasing order. Reject row and col indices with duplicates. 3160 */ 3161 ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr); 3162 ierr = ISSort(isrow[i]);CHKERRQ(ierr); 3163 ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr); 3164 ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr); 3165 for (j = 1; j < issize; ++j) { 3166 if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3167 } 3168 ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr); 3169 ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr); 3170 ierr = ISSort(iscol[i]);CHKERRQ(ierr); 3171 ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr); 3172 ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr); 3173 for (j = 1; j < issize; ++j) { 3174 if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]); 3175 } 3176 ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr); 3177 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3178 if (size > 1) { 3179 cisrow[ii] = isrow[i]; 3180 ++ii; 3181 } 3182 } 3183 /* 3184 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 3185 array of sequential matrices underlying the resulting parallel matrices. 3186 Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or 3187 contain duplicates. 
3188 3189 There are as many diag matrices as there are original index sets. There are only as many parallel 3190 and off-diag matrices, as there are parallel (comm size > 1) index sets. 3191 3192 ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq(): 3193 - If the array of MPI matrices already exists and is being reused, we need to allocate the array 3194 and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq 3195 will deposite the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them 3196 with A[i] and B[ii] extracted from the corresponding MPI submat. 3197 - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii] 3198 will have a different order from what getsubmats_seq expects. To handle this case -- indicated 3199 by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii] 3200 (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its 3201 values into A[i] and B[ii] sitting inside the corresponding submat. 3202 - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding 3203 A[i], B[ii], AA[i] or BB[ii] matrices. 3204 */ 3205 /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */ 3206 if (scall == MAT_INITIAL_MATRIX) { 3207 ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr); 3208 } 3209 3210 /* Now obtain the sequential A and B submatrices separately. 
*/ 3211 /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */ 3212 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);CHKERRQ(ierr); 3213 ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr); 3214 3215 /* 3216 If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential 3217 matrices A & B have been extracted directly into the parallel matrices containing them, or 3218 simply into the sequential matrix identical with the corresponding A (if size == 1). 3219 Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected 3220 to have the same sparsity pattern. 3221 Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap 3222 must be constructed for C. This is done by setseqmat(s). 3223 */ 3224 for (i = 0, ii = 0; i < ismax; ++i) { 3225 /* 3226 TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol? 3227 That way we can avoid sorting and computing permutations when reusing. 3228 To this end: 3229 - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX 3230 - if caching arrays to hold the ISs, make and compose a container for them so that it can 3231 be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents). 3232 */ 3233 MatStructure pattern = DIFFERENT_NONZERO_PATTERN; 3234 3235 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);CHKERRMPI(ierr); 3236 /* Construct submat[i] from the Seq pieces A (and B, if necessary). 
*/ 3237 if (size > 1) { 3238 if (scall == MAT_INITIAL_MATRIX) { 3239 ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr); 3240 ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 3241 ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr); 3242 ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr); 3243 ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr); 3244 } 3245 /* 3246 For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix. 3247 */ 3248 { 3249 Mat AA = A[i],BB = B[ii]; 3250 3251 if (AA || BB) { 3252 ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr); 3253 ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3254 ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 3255 } 3256 ierr = MatDestroy(&AA);CHKERRQ(ierr); 3257 } 3258 ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr); 3259 ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr); 3260 ++ii; 3261 } else { /* if (size == 1) */ 3262 if (scall == MAT_REUSE_MATRIX) { 3263 ierr = MatDestroy(&(*submat)[i]);CHKERRQ(ierr); 3264 } 3265 if (isrow_p[i] || iscol_p[i]) { 3266 ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr); 3267 ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr); 3268 /* Otherwise A is extracted straight into (*submats)[i]. */ 3269 /* TODO: Compose A[i] on (*submat([i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). 
*/ 3270 ierr = MatDestroy(A+i);CHKERRQ(ierr); 3271 } else (*submat)[i] = A[i]; 3272 } 3273 ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr); 3274 ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr); 3275 } 3276 ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr); 3277 ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr); 3278 ierr = PetscFree(ciscol_p);CHKERRQ(ierr); 3279 ierr = PetscFree(A);CHKERRQ(ierr); 3280 ierr = MatDestroySubMatrices(cismax,&B);CHKERRQ(ierr); 3281 PetscFunctionReturn(0); 3282 } 3283 3284 PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 3285 { 3286 PetscErrorCode ierr; 3287 3288 PetscFunctionBegin; 3289 ierr = MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr); 3290 PetscFunctionReturn(0); 3291 } 3292