1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*); 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*); 12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 15 #undef __FUNCT__ 16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 18 { 19 PetscErrorCode ierr; 20 PetscInt i; 21 22 PetscFunctionBegin; 23 if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 24 for (i=0; i<ov; ++i) { 25 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 26 } 27 PetscFunctionReturn(0); 28 } 29 30 /* 31 Sample message format: 32 If a processor A wants processor B to process some elements corresponding 33 to index sets is[1],is[5] 34 mesg [0] = 2 (no of index sets in the mesg) 35 ----------- 36 mesg [1] = 1 => is[1] 37 mesg [2] = sizeof(is[1]); 38 ----------- 39 mesg [3] = 5 => is[5] 40 mesg [4] = sizeof(is[5]); 41 ----------- 42 mesg [5] 43 mesg [n] datas[1] 44 ----------- 45 mesg[n+1] 46 mesg[m] data(is[5]) 47 ----------- 48 49 Notes: 50 nrqs - no of requests sent (or to be sent out) 51 nrqr - no of requests recieved (which have to be or which have been processed 52 */ 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 56 { 57 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 58 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 59 const PetscInt **idx,*idx_i; 60 PetscInt *n,**data,len; 61 PetscErrorCode ierr; 62 PetscMPIInt size,rank,tag1,tag2; 63 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 64 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 65 PetscBT *table; 66 MPI_Comm comm; 67 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 68 MPI_Status *s_status,*recv_status; 69 char *t_p; 70 71 PetscFunctionBegin; 72 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 73 size = c->size; 74 rank = c->rank; 75 M = C->rmap->N; 76 77 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 78 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 79 80 ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr); 81 82 for (i=0; i<imax; i++) { 83 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 84 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 85 } 86 87 /* evaluate communication - mesg to who,length of mesg, and buffer space 88 required. Based on this, buffers are allocated, and data copied into them*/ 89 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); 90 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 91 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 92 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 93 for (i=0; i<imax; i++) { 94 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 95 idx_i = idx[i]; 96 len = n[i]; 97 for (j=0; j<len; j++) { 98 row = idx_i[j]; 99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 100 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 101 w4[proc]++; 102 } 103 for (j=0; j<size; j++) { 104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 105 } 106 } 107 108 nrqs = 0; /* no of outgoing messages */ 109 msz = 0; /* total mesg length (for all proc */ 110 w1[rank] = 0; /* no mesg sent to intself */ 111 w3[rank] = 0; 112 for (i=0; i<size; i++) { 113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 114 } 115 /* pa - is list of processors to communicate with */ 116 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); 117 for (i=0,j=0; i<size; i++) { 118 if (w1[i]) {pa[j] = i; j++;} 119 } 120 121 /* Each message would have a header = 1 + 2*(no of IS) + data */ 122 for (i=0; i<nrqs; i++) { 123 j = pa[i]; 124 w1[j] += w2[j] + 2*w3[j]; 125 msz += w1[j]; 126 } 127 128 /* Determine the number of messages to expect, their lengths, from from-ids */ 129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 131 132 /* Now post the Irecvs corresponding to these messages */ 133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 134 135 /* Allocate Memory for outgoing messages */ 136 ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 137 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 138 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 139 140 { 141 PetscInt *iptr = tmp,ict = 0; 142 for (i=0; i<nrqs; i++) { 143 j = pa[i]; 144 iptr += ict; 145 outdat[j] = iptr; 146 ict = w1[j]; 147 } 148 } 149 150 /* Form the outgoing messages */ 151 /*plug in the headers*/ 152 for (i=0; i<nrqs; i++) { 153 j = pa[i]; 154 outdat[j][0] = 0; 155 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 156 ptr[j] = outdat[j] + 2*w3[j] + 1; 157 } 158 159 /* Memory for doing local proc's work*/ 160 { 161 ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz, 162 M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr); 163 ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr); 164 ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr); 165 ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr); 166 ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr); 167 ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr); 168 169 for (i=0; i<imax; i++) { 170 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 171 data[i] = d_p + M*i; 172 } 173 } 174 175 /* Parse the IS and update local tables and the outgoing buf with the data*/ 176 { 177 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 178 PetscBT table_i; 179 180 for (i=0; i<imax; i++) { 181 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 182 n_i = n[i]; 183 table_i = table[i]; 184 idx_i = idx[i]; 185 data_i = data[i]; 186 isz_i = isz[i]; 187 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 188 row = idx_i[j]; 189 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 190 if (proc != rank) { /* copy to the outgoing buffer */ 191 ctr[proc]++; 192 *ptr[proc] = row; 193 ptr[proc]++; 194 } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */ 195 } 196 /* Update the headers for the current IS */ 197 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 198 if ((ctr_j = ctr[j])) { 199 outdat_j = outdat[j]; 200 k = ++outdat_j[0]; 201 outdat_j[2*k] = ctr_j; 202 outdat_j[2*k-1] = i; 203 } 204 } 205 isz[i] = isz_i; 206 } 207 } 208 209 /* Now post the sends */ 210 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 211 for (i=0; i<nrqs; ++i) { 212 j = pa[i]; 213 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 214 } 215 216 /* No longer need the original indices*/ 217 for (i=0; i<imax; ++i) { 218 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 219 } 220 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 221 222 for (i=0; i<imax; ++i) { 223 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 224 } 225 226 /* Do Local work*/ 227 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 228 229 /* Receive messages*/ 230 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr); 231 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 232 233 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 234 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 235 236 /* Phase 1 sends are complete - deallocate buffers */ 237 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 238 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 239 240 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr); 241 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr); 242 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 243 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 244 ierr = PetscFree(rbuf);CHKERRQ(ierr); 245 246 247 /* Send the data back*/ 248 /* Do a global reduction to know the buffer space req for incoming messages*/ 249 { 250 PetscMPIInt *rw1; 251 252 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr); 253 ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 254 255 for (i=0; i<nrqr; ++i) { 256 proc = recv_status[i].MPI_SOURCE; 257 258 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 259 rw1[proc] = isz1[i]; 260 } 261 ierr = PetscFree(onodes1);CHKERRQ(ierr); 262 ierr = PetscFree(olengths1);CHKERRQ(ierr); 263 264 /* Determine the number of messages to expect, their lengths, from from-ids */ 265 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 266 ierr = PetscFree(rw1);CHKERRQ(ierr); 267 } 268 /* Now post the Irecvs corresponding to these messages */ 269 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 270 271 /* Now post the sends */ 272 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 273 for (i=0; i<nrqr; ++i) { 274 j = recv_status[i].MPI_SOURCE; 275 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 276 } 277 278 /* receive work done on other processors*/ 279 { 280 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 281 PetscMPIInt idex; 282 PetscBT table_i; 283 MPI_Status *status2; 284 285 ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr); 286 for (i=0; i<nrqs; ++i) { 287 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 288 /* Process the message*/ 289 rbuf2_i = rbuf2[idex]; 290 ct1 = 2*rbuf2_i[0]+1; 291 jmax = rbuf2[idex][0]; 292 for (j=1; j<=jmax; j++) { 293 max = rbuf2_i[2*j]; 294 is_no = rbuf2_i[2*j-1]; 295 isz_i = isz[is_no]; 296 data_i = data[is_no]; 297 table_i = table[is_no]; 298 for (k=0; k<max; k++,ct1++) { 299 row = rbuf2_i[ct1]; 300 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; 301 } 302 isz[is_no] = isz_i; 303 } 304 } 305 306 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 307 ierr = PetscFree(status2);CHKERRQ(ierr); 308 } 309 310 for (i=0; i<imax; ++i) { 311 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 312 } 313 314 ierr = PetscFree(onodes2);CHKERRQ(ierr); 315 ierr = PetscFree(olengths2);CHKERRQ(ierr); 316 317 ierr = PetscFree(pa);CHKERRQ(ierr); 318 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 319 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 320 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 321 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 322 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 323 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 324 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 325 ierr = PetscFree(s_status);CHKERRQ(ierr); 326 ierr = PetscFree(recv_status);CHKERRQ(ierr); 327 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 328 ierr = PetscFree(xdata);CHKERRQ(ierr); 329 ierr = PetscFree(isz1);CHKERRQ(ierr); 330 PetscFunctionReturn(0); 331 } 332 333 #undef __FUNCT__ 334 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 335 /* 336 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 337 the work on the local processor. 338 339 Inputs: 340 C - MAT_MPIAIJ; 341 imax - total no of index sets processed at a time; 342 table - an array of char - size = m bits. 343 344 Output: 345 isz - array containing the count of the solution elements corresponding 346 to each index set; 347 data - pointer to the solutions 348 */ 349 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 350 { 351 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 352 Mat A = c->A,B = c->B; 353 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 354 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 355 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 356 PetscBT table_i; 357 358 PetscFunctionBegin; 359 rstart = C->rmap->rstart; 360 cstart = C->cmap->rstart; 361 ai = a->i; 362 aj = a->j; 363 bi = b->i; 364 bj = b->j; 365 garray = c->garray; 366 367 368 for (i=0; i<imax; i++) { 369 data_i = data[i]; 370 table_i = table[i]; 371 isz_i = isz[i]; 372 for (j=0,max=isz[i]; j<max; j++) { 373 row = data_i[j] - rstart; 374 start = ai[row]; 375 end = ai[row+1]; 376 for (k=start; k<end; k++) { /* Amat */ 377 val = aj[k] + cstart; 378 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 379 } 380 start = bi[row]; 381 end = bi[row+1]; 382 for (k=start; k<end; k++) { /* Bmat */ 383 val = garray[bj[k]]; 384 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 385 } 386 } 387 isz[i] = isz_i; 388 } 389 PetscFunctionReturn(0); 390 } 391 392 #undef __FUNCT__ 393 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 394 /* 395 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 396 and return the output 397 398 Input: 399 C - the matrix 400 nrqr - no of messages being processed. 401 rbuf - an array of pointers to the recieved requests 402 403 Output: 404 xdata - array of messages to be sent back 405 isz1 - size of each message 406 407 For better efficiency perhaps we should malloc separately each xdata[i], 408 then if a remalloc is required we need only copy the data for that one row 409 rather then all previous rows as it is now where a single large chunck of 410 memory is used. 411 412 */ 413 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 414 { 415 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 416 Mat A = c->A,B = c->B; 417 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 418 PetscErrorCode ierr; 419 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 420 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 421 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 422 PetscInt *rbuf_i,kmax,rbuf_0; 423 PetscBT xtable; 424 425 PetscFunctionBegin; 426 m = C->rmap->N; 427 rstart = C->rmap->rstart; 428 cstart = C->cmap->rstart; 429 ai = a->i; 430 aj = a->j; 431 bi = b->i; 432 bj = b->j; 433 garray = c->garray; 434 435 436 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 437 rbuf_i = rbuf[i]; 438 rbuf_0 = rbuf_i[0]; 439 ct += rbuf_0; 440 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 441 } 442 443 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 444 else max1 = 1; 445 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 446 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 447 ++no_malloc; 448 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 449 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 450 451 ct3 = 0; 452 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 453 rbuf_i = rbuf[i]; 454 rbuf_0 = rbuf_i[0]; 455 ct1 = 2*rbuf_0+1; 456 ct2 = ct1; 457 ct3 += ct1; 458 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 459 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 460 oct2 = ct2; 461 kmax = rbuf_i[2*j]; 462 for (k=0; k<kmax; k++,ct1++) { 463 row = rbuf_i[ct1]; 464 if (!PetscBTLookupSet(xtable,row)) { 465 if (!(ct3 < mem_estimate)) { 466 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 467 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 468 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 469 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 470 xdata[0] = tmp; 471 mem_estimate = new_estimate; ++no_malloc; 472 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 473 } 474 xdata[i][ct2++] = row; 475 ct3++; 476 } 477 } 478 for (k=oct2,max2=ct2; k<max2; k++) { 479 row = xdata[i][k] - rstart; 480 start = ai[row]; 481 end = ai[row+1]; 482 for (l=start; l<end; l++) { 483 val = aj[l] + cstart; 484 if (!PetscBTLookupSet(xtable,val)) { 485 if (!(ct3 < mem_estimate)) { 486 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 487 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 488 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 489 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 490 xdata[0] = tmp; 491 mem_estimate = new_estimate; ++no_malloc; 492 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 493 } 494 xdata[i][ct2++] = val; 495 ct3++; 496 } 497 } 498 start = bi[row]; 499 end = bi[row+1]; 500 for (l=start; l<end; l++) { 501 val = garray[bj[l]]; 502 if (!PetscBTLookupSet(xtable,val)) { 503 if (!(ct3 < mem_estimate)) { 504 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 505 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 506 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 507 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 508 xdata[0] = tmp; 509 mem_estimate = new_estimate; ++no_malloc; 510 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 511 } 512 xdata[i][ct2++] = val; 513 ct3++; 514 } 515 } 516 } 517 /* Update the header*/ 518 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 519 xdata[i][2*j-1] = rbuf_i[2*j-1]; 520 } 521 xdata[i][0] = rbuf_0; 522 xdata[i+1] = xdata[i] + ct2; 523 isz1[i] = ct2; /* size of each message */ 524 } 525 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 526 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 527 PetscFunctionReturn(0); 528 } 529 /* -------------------------------------------------------------------------*/ 530 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 531 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 532 /* 533 Every processor gets the entire matrix 534 */ 535 #undef __FUNCT__ 536 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 537 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 538 { 539 Mat B; 540 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 541 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 542 PetscErrorCode ierr; 543 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 544 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 545 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 546 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 547 548 PetscFunctionBegin; 549 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 550 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 551 552 if (scall == MAT_INITIAL_MATRIX) { 553 /* ---------------------------------------------------------------- 554 Tell every processor the number of nonzeros per row 555 */ 556 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 557 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 558 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 559 } 560 sendcount = A->rmap->rend - A->rmap->rstart; 561 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 562 for (i=0; i<size; i++) { 563 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 564 displs[i] = A->rmap->range[i]; 565 } 566 #if defined(PETSC_HAVE_MPI_IN_PLACE) 567 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 568 #else 569 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 570 #endif 571 /* --------------------------------------------------------------- 572 Create the sequential matrix of the same type as the local block diagonal 573 */ 574 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 575 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 576 ierr = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr); 577 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 578 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 579 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 580 **Bin = B; 581 b = (Mat_SeqAIJ*)B->data; 582 583 /*-------------------------------------------------------------------- 584 Copy my part of matrix column indices over 585 */ 586 sendcount = ad->nz + bd->nz; 587 jsendbuf = b->j + b->i[rstarts[rank]]; 588 a_jsendbuf = ad->j; 589 b_jsendbuf = bd->j; 590 n = A->rmap->rend - A->rmap->rstart; 591 cnt = 0; 592 for (i=0; i<n; i++) { 593 594 /* put in lower diagonal portion */ 595 m = bd->i[i+1] - bd->i[i]; 596 while (m > 0) { 597 /* is it above diagonal (in bd (compressed) numbering) */ 598 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 599 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 600 m--; 601 } 602 603 /* put in diagonal portion */ 604 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 605 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 606 } 607 608 /* put in upper diagonal portion */ 609 while (m-- > 0) { 610 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 611 } 612 } 613 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 614 615 /*-------------------------------------------------------------------- 616 Gather all column indices to all processors 617 */ 618 for (i=0; i<size; i++) { 619 recvcounts[i] = 0; 620 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 621 recvcounts[i] += lens[j]; 622 } 623 } 624 displs[0] = 0; 625 for (i=1; i<size; i++) { 626 displs[i] = displs[i-1] + recvcounts[i-1]; 627 } 628 #if defined(PETSC_HAVE_MPI_IN_PLACE) 629 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 630 #else 631 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 632 #endif 633 /*-------------------------------------------------------------------- 634 Assemble the matrix into useable form (note numerical values not yet set) 635 */ 636 /* set the b->ilen (length of each row) values */ 637 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 638 /* set the b->i indices */ 639 b->i[0] = 0; 640 for (i=1; i<=A->rmap->N; i++) { 641 b->i[i] = b->i[i-1] + lens[i-1]; 642 } 643 ierr = PetscFree(lens);CHKERRQ(ierr); 644 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 645 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 646 647 } else { 648 B = **Bin; 649 b = (Mat_SeqAIJ*)B->data; 650 } 651 652 /*-------------------------------------------------------------------- 653 Copy my part of matrix numerical values into the values location 654 */ 655 if (flag == MAT_GET_VALUES) { 656 sendcount = ad->nz + bd->nz; 657 sendbuf = b->a + b->i[rstarts[rank]]; 658 a_sendbuf = ad->a; 659 b_sendbuf = bd->a; 660 b_sendj = bd->j; 661 n = A->rmap->rend - A->rmap->rstart; 662 cnt = 0; 663 for (i=0; i<n; i++) { 664 665 /* put in lower diagonal portion */ 666 m = bd->i[i+1] - bd->i[i]; 667 while (m > 0) { 668 /* is it above diagonal (in bd (compressed) numbering) */ 669 if (garray[*b_sendj] > A->rmap->rstart + i) break; 670 sendbuf[cnt++] = *b_sendbuf++; 671 m--; 672 b_sendj++; 673 } 674 675 /* put in diagonal portion */ 676 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 677 sendbuf[cnt++] = *a_sendbuf++; 678 } 679 680 /* put in upper diagonal portion */ 681 while (m-- > 0) { 682 sendbuf[cnt++] = *b_sendbuf++; 683 b_sendj++; 684 } 685 } 686 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 687 688 /* ----------------------------------------------------------------- 689 Gather all numerical values to all processors 690 */ 691 if (!recvcounts) { 692 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 693 } 694 for (i=0; i<size; i++) { 695 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 696 } 697 displs[0] = 0; 698 for (i=1; i<size; i++) { 699 displs[i] = displs[i-1] + recvcounts[i-1]; 700 } 701 recvbuf = b->a; 702 #if defined(PETSC_HAVE_MPI_IN_PLACE) 703 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 704 #else 705 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 706 #endif 707 } /* endof (flag == MAT_GET_VALUES) */ 708 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 709 710 if (A->symmetric) { 711 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 712 } else if (A->hermitian) { 713 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 714 } else if (A->structurally_symmetric) { 715 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 716 } 717 PetscFunctionReturn(0); 718 } 719 720 721 722 #undef __FUNCT__ 723 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 724 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 725 { 726 PetscErrorCode ierr; 727 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 728 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 729 730 PetscFunctionBegin; 731 /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */ 732 /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level. 733 However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that 734 take care to order the result correctly by assembling it with MatSetValues() (after preallocating). 735 */ 736 for (i = 0; i < ismax; ++i) { 737 PetscBool sorted; 738 ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr); 739 if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i); 740 } 741 742 /* 743 Check for special case: each processor gets entire matrix 744 */ 745 if (ismax == 1 && C->rmap->N == C->cmap->N) { 746 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 747 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 748 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 749 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 750 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 751 wantallmatrix = PETSC_TRUE; 752 753 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 754 } 755 } 756 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 757 if (twantallmatrix) { 758 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 759 PetscFunctionReturn(0); 760 } 761 762 /* Allocate memory to hold all the submatrices */ 763 if (scall != MAT_REUSE_MATRIX) { 764 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 765 } 766 767 /* Check for special case: each processor gets entire matrix columns */ 768 ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr); 769 for (i=0; i<ismax; i++) { 770 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 771 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 772 if (colflag && ncol == C->cmap->N) { 773 allcolumns[i] = PETSC_TRUE; 774 } else { 775 allcolumns[i] = PETSC_FALSE; 776 } 777 } 778 779 /* Determine the number of stages through which submatrices are done */ 780 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 781 782 /* 783 Each stage will extract nmax submatrices. 784 nmax is determined by the matrix column dimension. 785 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 786 */ 787 if (!nmax) nmax = 1; 788 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 789 790 /* Make sure every processor loops through the nstages */ 791 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 792 793 for (i=0,pos=0; i<nstages; i++) { 794 if (pos+nmax <= ismax) max_no = nmax; 795 else if (pos == ismax) max_no = 0; 796 else max_no = ismax-pos; 797 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 798 pos += max_no; 799 } 800 801 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 802 PetscFunctionReturn(0); 803 } 804 805 /* -------------------------------------------------------------------------*/ 806 #undef __FUNCT__ 807 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 808 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 809 { 810 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 811 Mat A = c->A; 812 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 813 const PetscInt **icol,**irow; 814 PetscInt *nrow,*ncol,start; 815 PetscErrorCode ierr; 816 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 817 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 818 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 819 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 820 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 821 #if defined(PETSC_USE_CTABLE) 822 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 823 #else 824 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 825 #endif 826 const PetscInt *irow_i; 827 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 828 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 829 MPI_Request *r_waits4,*s_waits3,*s_waits4; 830 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 831 MPI_Status *r_status3,*r_status4,*s_status4; 832 MPI_Comm comm; 833 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 834 PetscMPIInt *onodes1,*olengths1; 835 PetscMPIInt idex,idex2,end; 836 837 PetscFunctionBegin; 838 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 839 tag0 = ((PetscObject)C)->tag; 840 size = c->size; 841 rank = c->rank; 842 843 /* Get some new tags to keep the communication clean */ 844 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 845 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 846 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 847 848 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 849 850 for (i=0; i<ismax; i++) { 851 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 852 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 853 if (allcolumns[i]) { 854 icol[i] = NULL; 855 ncol[i] = C->cmap->N; 856 } else { 857 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 858 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 859 } 860 } 861 862 /* evaluate communication - mesg to who, length of mesg, and buffer space 863 required. Based on this, buffers are allocated, and data copied into them*/ 864 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 865 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 866 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 867 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 868 for (i=0; i<ismax; i++) { 869 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 870 jmax = nrow[i]; 871 irow_i = irow[i]; 872 for (j=0; j<jmax; j++) { 873 l = 0; 874 row = irow_i[j]; 875 while (row >= C->rmap->range[l+1]) l++; 876 proc = l; 877 w4[proc]++; 878 } 879 for (j=0; j<size; j++) { 880 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 881 } 882 } 883 884 nrqs = 0; /* no of outgoing messages */ 885 msz = 0; /* total mesg length (for all procs) */ 886 w1[rank] = 0; /* no mesg sent to self */ 887 w3[rank] = 0; 888 for (i=0; i<size; i++) { 889 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 890 } 891 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 892 for (i=0,j=0; i<size; i++) { 893 if (w1[i]) { pa[j] = i; j++; } 894 } 895 896 /* Each message would have a header = 1 + 2*(no of IS) + data */ 897 for (i=0; i<nrqs; i++) { 898 j = pa[i]; 899 w1[j] += w2[j] + 2* w3[j]; 900 msz += w1[j]; 901 } 902 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 903 904 /* Determine the number of messages to expect, their lengths, from from-ids */ 905 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 906 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 907 908 /* Now post the Irecvs corresponding to these messages */ 909 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 910 911 ierr = PetscFree(onodes1);CHKERRQ(ierr); 912 ierr = PetscFree(olengths1);CHKERRQ(ierr); 913 914 /* Allocate Memory for outgoing messages */ 915 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 916 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 917 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 918 919 { 920 PetscInt *iptr = tmp,ict = 0; 921 for (i=0; i<nrqs; i++) { 922 j = pa[i]; 923 iptr += ict; 924 sbuf1[j] = iptr; 925 ict = w1[j]; 926 } 927 } 928 929 /* Form the outgoing messages */ 930 /* Initialize the header space */ 931 for (i=0; i<nrqs; i++) { 932 j = pa[i]; 933 sbuf1[j][0] = 0; 934 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 935 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 936 } 937 938 /* Parse the isrow and copy data into outbuf */ 939 for (i=0; i<ismax; i++) { 940 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 941 irow_i = irow[i]; 942 jmax = nrow[i]; 943 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 944 l = 0; 945 row = irow_i[j]; 946 while (row >= C->rmap->range[l+1]) l++; 947 proc = l; 948 if (proc != rank) { /* copy to the outgoing buf*/ 949 ctr[proc]++; 950 *ptr[proc] = row; 951 ptr[proc]++; 952 } 953 } 954 /* Update the headers for the current IS */ 955 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 956 if ((ctr_j = ctr[j])) { 957 sbuf1_j = sbuf1[j]; 958 k = ++sbuf1_j[0]; 959 sbuf1_j[2*k] = ctr_j; 960 sbuf1_j[2*k-1] = i; 961 } 962 } 963 } 964 965 /* Now post the sends */ 966 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 967 for (i=0; i<nrqs; ++i) { 968 j = pa[i]; 969 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 970 } 971 972 /* Post Receives to capture the buffer size */ 973 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 974 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 975 rbuf2[0] = tmp + msz; 976 for (i=1; i<nrqs; ++i) { 977 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 978 } 979 for (i=0; i<nrqs; ++i) { 980 j = pa[i]; 981 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 982 } 983 984 /* Send to other procs the buf size they should allocate */ 985 986 987 /* Receive messages*/ 988 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 989 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 990 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 991 { 992 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 993 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 994 PetscInt *sbuf2_i; 995 996 for (i=0; i<nrqr; ++i) { 997 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 998 999 req_size[idex] = 0; 1000 rbuf1_i = rbuf1[idex]; 1001 start = 2*rbuf1_i[0] + 1; 1002 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1003 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 1004 sbuf2_i = sbuf2[idex]; 1005 for (j=start; j<end; j++) { 1006 id = rbuf1_i[j] - rstart; 1007 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1008 sbuf2_i[j] = ncols; 1009 req_size[idex] += ncols; 1010 } 1011 req_source[idex] = r_status1[i].MPI_SOURCE; 1012 /* form the header */ 1013 sbuf2_i[0] = req_size[idex]; 1014 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1015 1016 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1017 } 1018 } 1019 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1020 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1021 1022 /* recv buffer sizes */ 1023 /* Receive messages*/ 1024 1025 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1026 ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1027 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1028 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1029 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1030 1031 for (i=0; i<nrqs; ++i) { 1032 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1033 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1034 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1035 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1036 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1037 } 1038 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1039 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1040 1041 /* Wait on sends1 and sends2 */ 1042 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1043 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1044 1045 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1046 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1047 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1048 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1049 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1050 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1051 1052 /* Now allocate buffers for a->j, and send them off */ 1053 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1054 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1055 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1056 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1057 1058 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1059 { 1060 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1061 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1062 PetscInt cend = C->cmap->rend; 1063 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1064 1065 for (i=0; i<nrqr; i++) { 1066 rbuf1_i = rbuf1[i]; 1067 sbuf_aj_i = sbuf_aj[i]; 1068 ct1 = 2*rbuf1_i[0] + 1; 1069 ct2 = 0; 1070 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1071 kmax = rbuf1[i][2*j]; 1072 for (k=0; k<kmax; k++,ct1++) { 1073 row = rbuf1_i[ct1] - rstart; 1074 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1075 ncols = nzA + nzB; 1076 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1077 1078 /* load the column indices for this row into cols*/ 1079 cols = sbuf_aj_i + ct2; 1080 1081 lwrite = 0; 1082 for (l=0; l<nzB; l++) { 1083 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1084 } 1085 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1086 for (l=0; l<nzB; l++) { 1087 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1088 } 1089 1090 ct2 += ncols; 1091 } 1092 } 1093 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1094 } 1095 } 1096 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1097 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1098 1099 /* Allocate buffers for a->a, and send them off */ 1100 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1101 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1102 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1103 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1104 1105 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1106 { 1107 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1108 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1109 PetscInt cend = C->cmap->rend; 1110 PetscInt *b_j = b->j; 1111 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1112 1113 for (i=0; i<nrqr; i++) { 1114 rbuf1_i = rbuf1[i]; 1115 sbuf_aa_i = sbuf_aa[i]; 1116 ct1 = 2*rbuf1_i[0]+1; 1117 ct2 = 0; 1118 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1119 kmax = rbuf1_i[2*j]; 1120 for (k=0; k<kmax; k++,ct1++) { 1121 row = rbuf1_i[ct1] - rstart; 1122 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1123 ncols = nzA + nzB; 1124 cworkB = b_j + b_i[row]; 1125 vworkA = a_a + a_i[row]; 1126 vworkB = b_a + b_i[row]; 1127 1128 /* load the column values for this row into vals*/ 1129 vals = sbuf_aa_i+ct2; 1130 1131 lwrite = 0; 1132 for (l=0; l<nzB; l++) { 1133 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1134 } 1135 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1136 for (l=0; l<nzB; l++) { 1137 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1138 } 1139 1140 ct2 += ncols; 1141 } 1142 } 1143 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1144 } 1145 } 1146 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1147 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1148 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1149 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1150 1151 /* Form the matrix */ 1152 /* create col map: global col of C -> local col of submatrices */ 1153 { 1154 const PetscInt *icol_i; 1155 #if defined(PETSC_USE_CTABLE) 1156 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1157 for (i=0; i<ismax; i++) { 1158 if (!allcolumns[i]) { 1159 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1160 1161 jmax = ncol[i]; 1162 icol_i = icol[i]; 1163 cmap_i = cmap[i]; 1164 for (j=0; j<jmax; j++) { 1165 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1166 } 1167 } else { 1168 cmap[i] = NULL; 1169 } 1170 } 1171 #else 1172 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1173 for (i=0; i<ismax; i++) { 1174 if (!allcolumns[i]) { 1175 ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1176 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1177 jmax = ncol[i]; 1178 icol_i = icol[i]; 1179 cmap_i = cmap[i]; 1180 for (j=0; j<jmax; j++) { 1181 cmap_i[icol_i[j]] = j+1; 1182 } 1183 } else { 1184 cmap[i] = NULL; 1185 } 1186 } 1187 #endif 1188 } 1189 1190 /* Create lens which is required for MatCreate... */ 1191 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1192 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1193 if (ismax) { 1194 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1195 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1196 } 1197 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1198 1199 /* Update lens from local data */ 1200 for (i=0; i<ismax; i++) { 1201 jmax = nrow[i]; 1202 if (!allcolumns[i]) cmap_i = cmap[i]; 1203 irow_i = irow[i]; 1204 lens_i = lens[i]; 1205 for (j=0; j<jmax; j++) { 1206 l = 0; 1207 row = irow_i[j]; 1208 while (row >= C->rmap->range[l+1]) l++; 1209 proc = l; 1210 if (proc == rank) { 1211 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1212 if (!allcolumns[i]) { 1213 for (k=0; k<ncols; k++) { 1214 #if defined(PETSC_USE_CTABLE) 1215 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1216 #else 1217 tcol = cmap_i[cols[k]]; 1218 #endif 1219 if (tcol) lens_i[j]++; 1220 } 1221 } else { /* allcolumns */ 1222 lens_i[j] = ncols; 1223 } 1224 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1225 } 1226 } 1227 } 1228 1229 /* Create row map: global row of C -> local row of submatrices */ 1230 #if defined(PETSC_USE_CTABLE) 1231 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1232 for (i=0; i<ismax; i++) { 1233 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1234 rmap_i = rmap[i]; 1235 irow_i = irow[i]; 1236 jmax = nrow[i]; 1237 for (j=0; j<jmax; j++) { 1238 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1239 } 1240 } 1241 #else 1242 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1243 if (ismax) { 1244 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1245 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1246 } 1247 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1248 for (i=0; i<ismax; i++) { 1249 rmap_i = rmap[i]; 1250 irow_i = irow[i]; 1251 jmax = nrow[i]; 1252 for (j=0; j<jmax; j++) { 1253 rmap_i[irow_i[j]] = j; 1254 } 1255 } 1256 #endif 1257 1258 /* Update lens from offproc data */ 1259 { 1260 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1261 1262 for (tmp2=0; tmp2<nrqs; tmp2++) { 1263 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1264 idex = pa[idex2]; 1265 sbuf1_i = sbuf1[idex]; 1266 jmax = sbuf1_i[0]; 1267 ct1 = 2*jmax+1; 1268 ct2 = 0; 1269 rbuf2_i = rbuf2[idex2]; 1270 rbuf3_i = rbuf3[idex2]; 1271 for (j=1; j<=jmax; j++) { 1272 is_no = sbuf1_i[2*j-1]; 1273 max1 = sbuf1_i[2*j]; 1274 lens_i = lens[is_no]; 1275 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1276 rmap_i = rmap[is_no]; 1277 for (k=0; k<max1; k++,ct1++) { 1278 #if defined(PETSC_USE_CTABLE) 1279 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1280 row--; 1281 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1282 #else 1283 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1284 #endif 1285 max2 = rbuf2_i[ct1]; 1286 for (l=0; l<max2; l++,ct2++) { 1287 if (!allcolumns[is_no]) { 1288 #if defined(PETSC_USE_CTABLE) 1289 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1290 #else 1291 tcol = cmap_i[rbuf3_i[ct2]]; 1292 #endif 1293 if (tcol) lens_i[row]++; 1294 } else { /* allcolumns */ 1295 lens_i[row]++; /* lens_i[row] += max2 ? */ 1296 } 1297 } 1298 } 1299 } 1300 } 1301 } 1302 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1303 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1304 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1305 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1306 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1307 1308 /* Create the submatrices */ 1309 if (scall == MAT_REUSE_MATRIX) { 1310 PetscBool flag; 1311 1312 /* 1313 Assumes new rows are same length as the old rows,hence bug! 1314 */ 1315 for (i=0; i<ismax; i++) { 1316 mat = (Mat_SeqAIJ*)(submats[i]->data); 1317 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1318 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1319 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1320 /* Initial matrix as if empty */ 1321 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1322 1323 submats[i]->factortype = C->factortype; 1324 } 1325 } else { 1326 for (i=0; i<ismax; i++) { 1327 PetscInt rbs,cbs; 1328 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1329 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1330 1331 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1332 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1333 1334 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1335 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1336 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1337 } 1338 } 1339 1340 /* Assemble the matrices */ 1341 /* First assemble the local rows */ 1342 { 1343 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1344 PetscScalar *imat_a; 1345 1346 for (i=0; i<ismax; i++) { 1347 mat = (Mat_SeqAIJ*)submats[i]->data; 1348 imat_ilen = mat->ilen; 1349 imat_j = mat->j; 1350 imat_i = mat->i; 1351 imat_a = mat->a; 1352 1353 if (!allcolumns[i]) cmap_i = cmap[i]; 1354 rmap_i = rmap[i]; 1355 irow_i = irow[i]; 1356 jmax = nrow[i]; 1357 for (j=0; j<jmax; j++) { 1358 l = 0; 1359 row = irow_i[j]; 1360 while (row >= C->rmap->range[l+1]) l++; 1361 proc = l; 1362 if (proc == rank) { 1363 old_row = row; 1364 #if defined(PETSC_USE_CTABLE) 1365 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1366 row--; 1367 #else 1368 row = rmap_i[row]; 1369 #endif 1370 ilen_row = imat_ilen[row]; 1371 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1372 mat_i = imat_i[row]; 1373 mat_a = imat_a + mat_i; 1374 mat_j = imat_j + mat_i; 1375 if (!allcolumns[i]) { 1376 for (k=0; k<ncols; k++) { 1377 #if defined(PETSC_USE_CTABLE) 1378 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1379 #else 1380 tcol = cmap_i[cols[k]]; 1381 #endif 1382 if (tcol) { 1383 *mat_j++ = tcol - 1; 1384 *mat_a++ = vals[k]; 1385 ilen_row++; 1386 } 1387 } 1388 } else { /* allcolumns */ 1389 for (k=0; k<ncols; k++) { 1390 *mat_j++ = cols[k]; /* global col index! */ 1391 *mat_a++ = vals[k]; 1392 ilen_row++; 1393 } 1394 } 1395 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1396 1397 imat_ilen[row] = ilen_row; 1398 } 1399 } 1400 } 1401 } 1402 1403 /* Now assemble the off proc rows*/ 1404 { 1405 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1406 PetscInt *imat_j,*imat_i; 1407 PetscScalar *imat_a,*rbuf4_i; 1408 1409 for (tmp2=0; tmp2<nrqs; tmp2++) { 1410 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1411 idex = pa[idex2]; 1412 sbuf1_i = sbuf1[idex]; 1413 jmax = sbuf1_i[0]; 1414 ct1 = 2*jmax + 1; 1415 ct2 = 0; 1416 rbuf2_i = rbuf2[idex2]; 1417 rbuf3_i = rbuf3[idex2]; 1418 rbuf4_i = rbuf4[idex2]; 1419 for (j=1; j<=jmax; j++) { 1420 is_no = sbuf1_i[2*j-1]; 1421 rmap_i = rmap[is_no]; 1422 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1423 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1424 imat_ilen = mat->ilen; 1425 imat_j = mat->j; 1426 imat_i = mat->i; 1427 imat_a = mat->a; 1428 max1 = sbuf1_i[2*j]; 1429 for (k=0; k<max1; k++,ct1++) { 1430 row = sbuf1_i[ct1]; 1431 #if defined(PETSC_USE_CTABLE) 1432 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1433 row--; 1434 #else 1435 row = rmap_i[row]; 1436 #endif 1437 ilen = imat_ilen[row]; 1438 mat_i = imat_i[row]; 1439 mat_a = imat_a + mat_i; 1440 mat_j = imat_j + mat_i; 1441 max2 = rbuf2_i[ct1]; 1442 if (!allcolumns[is_no]) { 1443 for (l=0; l<max2; l++,ct2++) { 1444 1445 #if defined(PETSC_USE_CTABLE) 1446 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1447 #else 1448 tcol = cmap_i[rbuf3_i[ct2]]; 1449 #endif 1450 if (tcol) { 1451 *mat_j++ = tcol - 1; 1452 *mat_a++ = rbuf4_i[ct2]; 1453 ilen++; 1454 } 1455 } 1456 } else { /* allcolumns */ 1457 for (l=0; l<max2; l++,ct2++) { 1458 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1459 *mat_a++ = rbuf4_i[ct2]; 1460 ilen++; 1461 } 1462 } 1463 imat_ilen[row] = ilen; 1464 } 1465 } 1466 } 1467 } 1468 1469 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1470 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1471 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1472 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1473 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1474 1475 /* Restore the indices */ 1476 for (i=0; i<ismax; i++) { 1477 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1478 if (!allcolumns[i]) { 1479 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1480 } 1481 } 1482 1483 /* Destroy allocated memory */ 1484 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1485 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1486 ierr = PetscFree(pa);CHKERRQ(ierr); 1487 1488 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1489 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1490 for (i=0; i<nrqr; ++i) { 1491 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1492 } 1493 for (i=0; i<nrqs; ++i) { 1494 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1495 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1496 } 1497 1498 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1499 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1500 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1501 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1502 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1503 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1504 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1505 1506 #if defined(PETSC_USE_CTABLE) 1507 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1508 #else 1509 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1510 #endif 1511 ierr = PetscFree(rmap);CHKERRQ(ierr); 1512 1513 for (i=0; i<ismax; i++) { 1514 if (!allcolumns[i]) { 1515 #if defined(PETSC_USE_CTABLE) 1516 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1517 #else 1518 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1519 #endif 1520 } 1521 } 1522 ierr = PetscFree(cmap);CHKERRQ(ierr); 1523 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1524 ierr = PetscFree(lens);CHKERRQ(ierr); 1525 1526 for (i=0; i<ismax; i++) { 1527 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1528 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1529 } 1530 PetscFunctionReturn(0); 1531 } 1532 1533 /* 1534 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1535 Be careful not to destroy them elsewhere. 1536 */ 1537 #undef __FUNCT__ 1538 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1539 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1540 { 1541 /* If making this function public, change the error returned in this function away from _PLIB. */ 1542 PetscErrorCode ierr; 1543 Mat_MPIAIJ *aij; 1544 PetscBool seqaij; 1545 1546 PetscFunctionBegin; 1547 /* Check to make sure the component matrices are compatible with C. */ 1548 ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr); 1549 if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1550 ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr); 1551 if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1552 if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1553 1554 ierr = MatCreate(comm, C);CHKERRQ(ierr); 1555 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr); 1556 ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr); 1557 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1558 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1559 if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1560 ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr); 1561 aij = (Mat_MPIAIJ*)((*C)->data); 1562 aij->A = A; 1563 aij->B = B; 1564 ierr = PetscLogObjectParent(*C,A);CHKERRQ(ierr); 1565 ierr = PetscLogObjectParent(*C,B);CHKERRQ(ierr); 1566 1567 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1568 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1569 PetscFunctionReturn(0); 1570 } 1571 1572 #undef __FUNCT__ 1573 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1574 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1575 { 1576 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1577 1578 PetscFunctionBegin; 1579 PetscValidPointer(A,2); 1580 PetscValidPointer(B,3); 1581 *A = aij->A; 1582 *B = aij->B; 1583 /* Note that we don't incref *A and *B, so be careful! */ 1584 PetscFunctionReturn(0); 1585 } 1586 1587 #undef __FUNCT__ 1588 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1589 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1590 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1591 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1592 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1593 { 1594 PetscErrorCode ierr; 1595 PetscMPIInt size, flag; 1596 PetscInt i,ii; 1597 PetscInt ismax_c; 1598 1599 PetscFunctionBegin; 1600 if (!ismax) PetscFunctionReturn(0); 1601 1602 for (i = 0, ismax_c = 0; i < ismax; ++i) { 1603 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr); 1604 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1605 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1606 if (size > 1) ++ismax_c; 1607 } 1608 if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 1609 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1610 } else { /* if (ismax_c) */ 1611 Mat *A,*B; 1612 IS *isrow_c, *iscol_c; 1613 PetscMPIInt size; 1614 /* 1615 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1616 array of sequential matrices underlying the resulting parallel matrices. 1617 Which arrays to allocate is based on the value of MatReuse scall. 1618 There are as many diag matrices as there are original index sets. 1619 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 1620 1621 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1622 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1623 will deposite the extracted diag and off-diag parts. 1624 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1625 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1626 */ 1627 1628 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1629 if (scall != MAT_REUSE_MATRIX) { 1630 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1631 } else { 1632 ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr); 1633 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr); 1634 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1635 for (i = 0, ii = 0; i < ismax; ++i) { 1636 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1637 if (size > 1) { 1638 ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr); 1639 ++ii; 1640 } else A[i] = (*submat)[i]; 1641 } 1642 } 1643 /* 1644 Construct the complements of the iscol ISs for parallel ISs only. 1645 These are used to extract the off-diag portion of the resulting parallel matrix. 1646 The row IS for the off-diag portion is the same as for the diag portion, 1647 so we merely alias the row IS, while skipping those that are sequential. 1648 */ 1649 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr); 1650 for (i = 0, ii = 0; i < ismax; ++i) { 1651 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1652 if (size > 1) { 1653 isrow_c[ii] = isrow[i]; 1654 1655 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr); 1656 ++ii; 1657 } 1658 } 1659 /* Now obtain the sequential A and B submatrices separately. */ 1660 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr); 1661 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr); 1662 for (ii = 0; ii < ismax_c; ++ii) { 1663 ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr); 1664 } 1665 ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr); 1666 /* 1667 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1668 have been extracted directly into the parallel matrices containing them, or 1669 simply into the sequential matrix identical with the corresponding A (if size == 1). 1670 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1671 A is put into the correct submat slot (if size == 1). 1672 */ 1673 if (scall != MAT_REUSE_MATRIX) { 1674 for (i = 0, ii = 0; i < ismax; ++i) { 1675 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1676 if (size > 1) { 1677 /* 1678 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1679 */ 1680 /* Construct submat[i] from the Seq pieces A and B. */ 1681 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr); 1682 1683 ++ii; 1684 } else (*submat)[i] = A[i]; 1685 } 1686 } 1687 ierr = PetscFree(A);CHKERRQ(ierr); 1688 ierr = PetscFree(B);CHKERRQ(ierr); 1689 } 1690 PetscFunctionReturn(0); 1691 } /* MatGetSubMatricesParallel_MPIXAIJ() */ 1692 1693 1694 1695 #undef __FUNCT__ 1696 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1697 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1698 { 1699 PetscErrorCode ierr; 1700 1701 PetscFunctionBegin; 1702 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr); 1703 PetscFunctionReturn(0); 1704 } 1705