/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the imax index sets
   is[] on matrix C by ov levels; each level is one full round of
   communication performed by MatIncreaseOverlap_MPIAIJ_Once().
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into one contiguous slice per destination */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input index sets are replaced by the enlarged ones at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: merge each returned IS into the local table */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
     Output:
      isz    - array containing the count of the solution elements
               corresponding to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* for every locally owned row already in the set, add the global column
       indices of its nonzeros (duplicates filtered through the bit table) */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: diagonal block, shift to global numbering */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: off-diagonal block, map through garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
402 rbuf - an array of pointers to the recieved requests 403 404 Output: 405 xdata - array of messages to be sent back 406 isz1 - size of each message 407 408 For better efficiency perhaps we should malloc separately each xdata[i], 409 then if a remalloc is required we need only copy the data for that one row 410 rather then all previous rows as it is now where a single large chunck of 411 memory is used. 412 413 */ 414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 415 { 416 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 417 Mat A = c->A,B = c->B; 418 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 419 PetscErrorCode ierr; 420 PetscMPIInt rank; 421 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 422 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 423 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 424 PetscInt *rbuf_i,kmax,rbuf_0; 425 PetscBT xtable; 426 427 PetscFunctionBegin; 428 rank = c->rank; 429 m = C->rmap->N; 430 rstart = C->rmap->rstart; 431 cstart = C->cmap->rstart; 432 ai = a->i; 433 aj = a->j; 434 bi = b->i; 435 bj = b->j; 436 garray = c->garray; 437 438 439 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 440 rbuf_i = rbuf[i]; 441 rbuf_0 = rbuf_i[0]; 442 ct += rbuf_0; 443 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 444 } 445 446 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 447 else max1 = 1; 448 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 449 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 450 ++no_malloc; 451 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 452 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 453 454 ct3 = 0; 455 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 456 rbuf_i = rbuf[i]; 457 rbuf_0 = rbuf_i[0]; 458 ct1 = 2*rbuf_0+1; 459 ct2 = ct1; 460 ct3 += ct1; 461 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 462 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 463 oct2 = ct2; 464 kmax = rbuf_i[2*j]; 465 for (k=0; k<kmax; k++,ct1++) { 466 row = rbuf_i[ct1]; 467 if (!PetscBTLookupSet(xtable,row)) { 468 if (!(ct3 < mem_estimate)) { 469 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 470 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 471 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 472 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 473 xdata[0] = tmp; 474 mem_estimate = new_estimate; ++no_malloc; 475 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 476 } 477 xdata[i][ct2++] = row; 478 ct3++; 479 } 480 } 481 for (k=oct2,max2=ct2; k<max2; k++) { 482 row = xdata[i][k] - rstart; 483 start = ai[row]; 484 end = ai[row+1]; 485 for (l=start; l<end; l++) { 486 val = aj[l] + cstart; 487 if (!PetscBTLookupSet(xtable,val)) { 488 if (!(ct3 < mem_estimate)) { 489 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 490 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 491 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 492 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 493 xdata[0] = tmp; 494 mem_estimate = new_estimate; ++no_malloc; 495 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 496 } 497 xdata[i][ct2++] = val; 498 ct3++; 499 } 500 } 501 start = bi[row]; 502 end = bi[row+1]; 503 for (l=start; l<end; l++) { 504 val = garray[bj[l]]; 505 if (!PetscBTLookupSet(xtable,val)) { 506 if (!(ct3 < 
mem_estimate)) { 507 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 508 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 509 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 510 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 511 xdata[0] = tmp; 512 mem_estimate = new_estimate; ++no_malloc; 513 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 514 } 515 xdata[i][ct2++] = val; 516 ct3++; 517 } 518 } 519 } 520 /* Update the header*/ 521 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 522 xdata[i][2*j-1] = rbuf_i[2*j-1]; 523 } 524 xdata[i][0] = rbuf_0; 525 xdata[i+1] = xdata[i] + ct2; 526 isz1[i] = ct2; /* size of each message */ 527 } 528 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 529 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 530 PetscFunctionReturn(0); 531 } 532 /* -------------------------------------------------------------------------*/ 533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 535 /* 536 Every processor gets the entire matrix 537 */ 538 #undef __FUNCT__ 539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 540 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 541 { 542 Mat B; 543 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 544 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 545 PetscErrorCode ierr; 546 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 547 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 548 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 549 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 550 551 PetscFunctionBegin; 552 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 553 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* local row length = diagonal-block nnz + off-diagonal-block nnz */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not allocated above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* all ranks must agree before taking the fast all-matrix path */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    /* ranks that have exhausted their sets still participate with max_no = 0 */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status
*r_status3,*r_status4,*s_status4; 806 MPI_Comm comm; 807 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 808 PetscBool sorted; 809 PetscMPIInt *onodes1,*olengths1; 810 PetscMPIInt idex,idex2,end; 811 812 PetscFunctionBegin; 813 comm = ((PetscObject)C)->comm; 814 tag0 = ((PetscObject)C)->tag; 815 size = c->size; 816 rank = c->rank; 817 818 /* Get some new tags to keep the communication clean */ 819 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 820 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 821 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 822 823 /* Check if the col indices are sorted */ 824 for (i=0; i<ismax; i++) { 825 ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr); 826 /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/ 827 ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr); 828 /* if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */ 829 } 830 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 831 832 for (i=0; i<ismax; i++) { 833 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 834 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 835 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 836 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 837 } 838 839 /* evaluate communication - mesg to who, length of mesg, and buffer space 840 required. 
Based on this, buffers are allocated, and data copied into them*/ 841 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 842 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 843 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 844 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 845 for (i=0; i<ismax; i++) { 846 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 847 jmax = nrow[i]; 848 irow_i = irow[i]; 849 for (j=0; j<jmax; j++) { 850 l = 0; 851 row = irow_i[j]; 852 while (row >= C->rmap->range[l+1]) l++; 853 proc = l; 854 w4[proc]++; 855 } 856 for (j=0; j<size; j++) { 857 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 858 } 859 } 860 861 nrqs = 0; /* no of outgoing messages */ 862 msz = 0; /* total mesg length (for all procs) */ 863 w1[rank] = 0; /* no mesg sent to self */ 864 w3[rank] = 0; 865 for (i=0; i<size; i++) { 866 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 867 } 868 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 869 for (i=0,j=0; i<size; i++) { 870 if (w1[i]) { pa[j] = i; j++; } 871 } 872 873 /* Each message would have a header = 1 + 2*(no of IS) + data */ 874 for (i=0; i<nrqs; i++) { 875 j = pa[i]; 876 w1[j] += w2[j] + 2* w3[j]; 877 msz += w1[j]; 878 } 879 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 880 881 /* Determine the number of messages to expect, their lengths, from from-ids */ 882 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 883 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 884 885 /* Now post the Irecvs corresponding to these messages */ 886 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 887 888 ierr = PetscFree(onodes1);CHKERRQ(ierr); 889 ierr = PetscFree(olengths1);CHKERRQ(ierr); 890 891 /* Allocate Memory for outgoing messages */ 892 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 893 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 894 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 895 896 { 897 PetscInt *iptr = tmp,ict = 0; 898 for (i=0; i<nrqs; i++) { 899 j = pa[i]; 900 iptr += ict; 901 sbuf1[j] = iptr; 902 ict = w1[j]; 903 } 904 } 905 906 /* Form the outgoing messages */ 907 /* Initialize the header space */ 908 for (i=0; i<nrqs; i++) { 909 j = pa[i]; 910 sbuf1[j][0] = 0; 911 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 912 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 913 } 914 915 /* Parse the isrow and copy data into outbuf */ 916 for (i=0; i<ismax; i++) { 917 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 918 irow_i = irow[i]; 919 jmax = nrow[i]; 920 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 921 l = 0; 922 row = irow_i[j]; 923 while (row >= C->rmap->range[l+1]) l++; 924 proc = l; 925 if (proc != rank) { /* copy to the outgoing buf*/ 926 ctr[proc]++; 927 *ptr[proc] = row; 928 ptr[proc]++; 929 } 930 } 931 /* Update the headers for the current IS */ 932 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 933 if ((ctr_j = ctr[j])) { 934 sbuf1_j = sbuf1[j]; 935 k = ++sbuf1_j[0]; 936 sbuf1_j[2*k] = ctr_j; 937 sbuf1_j[2*k-1] = i; 938 } 939 } 940 } 941 942 /* Now post the sends */ 943 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 944 for (i=0; i<nrqs; ++i) { 945 j = pa[i]; 946 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 947 } 948 949 /* Post Receives to capture the buffer size */ 950 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 951 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 952 rbuf2[0] = tmp + msz; 953 for (i=1; i<nrqs; ++i) { 954 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 955 } 956 for (i=0; i<nrqs; ++i) { 957 j = pa[i]; 958 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 959 } 960 961 /* Send to other procs the buf size they should allocate */ 962 963 964 /* Receive messages*/ 965 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 966 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 967 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 968 { 969 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 970 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 971 PetscInt *sbuf2_i; 972 973 for (i=0; i<nrqr; ++i) { 974 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 975 req_size[idex] = 0; 976 rbuf1_i = rbuf1[idex]; 977 start = 2*rbuf1_i[0] + 1; 978 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 979 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 980 sbuf2_i = sbuf2[idex]; 981 for (j=start; j<end; j++) { 982 id = rbuf1_i[j] - rstart; 983 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 984 sbuf2_i[j] = ncols; 985 req_size[idex] += ncols; 986 } 987 req_source[idex] = r_status1[i].MPI_SOURCE; 988 /* form the header */ 989 sbuf2_i[0] = req_size[idex]; 990 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 991 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 992 } 993 } 994 ierr = PetscFree(r_status1);CHKERRQ(ierr); 995 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 996 997 /* recv buffer sizes */ 998 /* Receive messages*/ 999 1000 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1001 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1002 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1003 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1004 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1005 1006 for (i=0; i<nrqs; ++i) { 1007 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1008 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1009 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1010 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1011 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1012 } 1013 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1014 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1015 1016 /* Wait on sends1 and sends2 */ 1017 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1018 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1019 1020 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1021 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1022 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1023 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1024 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1025 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1026 1027 /* Now allocate buffers for a->j, and send them off */ 1028 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1029 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1030 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1031 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1032 1033 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1034 { 1035 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1036 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1037 PetscInt cend = C->cmap->rend; 1038 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1039 1040 for (i=0; i<nrqr; i++) { 1041 rbuf1_i = rbuf1[i]; 1042 sbuf_aj_i = sbuf_aj[i]; 1043 ct1 = 2*rbuf1_i[0] + 1; 1044 ct2 = 0; 1045 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1046 kmax = rbuf1[i][2*j]; 1047 for (k=0; k<kmax; k++,ct1++) { 1048 row = rbuf1_i[ct1] - rstart; 1049 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1050 ncols = nzA + nzB; 1051 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1052 1053 /* load the column indices for this row into cols*/ 1054 cols = sbuf_aj_i + ct2; 1055 1056 lwrite = 0; 1057 for (l=0; l<nzB; l++) { 1058 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1059 } 1060 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1061 for (l=0; l<nzB; l++) { 1062 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1063 } 1064 1065 ct2 += ncols; 1066 } 1067 } 1068 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1069 } 1070 } 1071 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1072 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1073 1074 /* Allocate buffers for a->a, and send them off */ 1075 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1076 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1077 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1078 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1079 1080 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1081 { 1082 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1083 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1084 PetscInt cend = C->cmap->rend; 1085 PetscInt *b_j = b->j; 1086 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1087 1088 for (i=0; 
i<nrqr; i++) { 1089 rbuf1_i = rbuf1[i]; 1090 sbuf_aa_i = sbuf_aa[i]; 1091 ct1 = 2*rbuf1_i[0]+1; 1092 ct2 = 0; 1093 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1094 kmax = rbuf1_i[2*j]; 1095 for (k=0; k<kmax; k++,ct1++) { 1096 row = rbuf1_i[ct1] - rstart; 1097 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1098 ncols = nzA + nzB; 1099 cworkB = b_j + b_i[row]; 1100 vworkA = a_a + a_i[row]; 1101 vworkB = b_a + b_i[row]; 1102 1103 /* load the column values for this row into vals*/ 1104 vals = sbuf_aa_i+ct2; 1105 1106 lwrite = 0; 1107 for (l=0; l<nzB; l++) { 1108 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1109 } 1110 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1111 for (l=0; l<nzB; l++) { 1112 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1113 } 1114 1115 ct2 += ncols; 1116 } 1117 } 1118 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1119 } 1120 } 1121 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1122 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1123 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1124 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1125 1126 /* Form the matrix */ 1127 /* create col map */ 1128 { 1129 const PetscInt *icol_i; 1130 #if defined (PETSC_USE_CTABLE) 1131 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1132 for (i=0; i<ismax; i++) { 1133 ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr); 1134 jmax = ncol[i]; 1135 icol_i = icol[i]; 1136 cmap_i = cmap[i]; 1137 for (j=0; j<jmax; j++) { 1138 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr); 1139 } 1140 } 1141 #else 1142 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1143 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1144 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1145 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1146 
for (i=0; i<ismax; i++) { 1147 jmax = ncol[i]; 1148 icol_i = icol[i]; 1149 cmap_i = cmap[i]; 1150 for (j=0; j<jmax; j++) { 1151 cmap_i[icol_i[j]] = j+1; 1152 } 1153 } 1154 #endif 1155 } 1156 1157 /* Create lens which is required for MatCreate... */ 1158 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1159 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1160 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1161 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1162 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1163 1164 /* Update lens from local data */ 1165 for (i=0; i<ismax; i++) { 1166 jmax = nrow[i]; 1167 cmap_i = cmap[i]; 1168 irow_i = irow[i]; 1169 lens_i = lens[i]; 1170 for (j=0; j<jmax; j++) { 1171 l = 0; 1172 row = irow_i[j]; 1173 while (row >= C->rmap->range[l+1]) l++; 1174 proc = l; 1175 if (proc == rank) { 1176 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1177 for (k=0; k<ncols; k++) { 1178 #if defined (PETSC_USE_CTABLE) 1179 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1180 #else 1181 tcol = cmap_i[cols[k]]; 1182 #endif 1183 if (tcol) { lens_i[j]++;} 1184 } 1185 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1186 } 1187 } 1188 } 1189 1190 /* Create row map*/ 1191 #if defined (PETSC_USE_CTABLE) 1192 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1193 for (i=0; i<ismax; i++) { 1194 ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr); 1195 rmap_i = rmap[i]; 1196 irow_i = irow[i]; 1197 jmax = nrow[i]; 1198 for (j=0; j<jmax; j++) { 1199 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr); 1200 } 1201 } 1202 #else 1203 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1204 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1205 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1206 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1207 
for (i=0; i<ismax; i++) { 1208 rmap_i = rmap[i]; 1209 irow_i = irow[i]; 1210 jmax = nrow[i]; 1211 for (j=0; j<jmax; j++) { 1212 rmap_i[irow_i[j]] = j; 1213 } 1214 } 1215 #endif 1216 1217 /* Update lens from offproc data */ 1218 { 1219 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1220 1221 for (tmp2=0; tmp2<nrqs; tmp2++) { 1222 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1223 idex = pa[idex2]; 1224 sbuf1_i = sbuf1[idex]; 1225 jmax = sbuf1_i[0]; 1226 ct1 = 2*jmax+1; 1227 ct2 = 0; 1228 rbuf2_i = rbuf2[idex2]; 1229 rbuf3_i = rbuf3[idex2]; 1230 for (j=1; j<=jmax; j++) { 1231 is_no = sbuf1_i[2*j-1]; 1232 max1 = sbuf1_i[2*j]; 1233 lens_i = lens[is_no]; 1234 cmap_i = cmap[is_no]; 1235 rmap_i = rmap[is_no]; 1236 for (k=0; k<max1; k++,ct1++) { 1237 #if defined (PETSC_USE_CTABLE) 1238 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1239 row--; 1240 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1241 #else 1242 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1243 #endif 1244 max2 = rbuf2_i[ct1]; 1245 for (l=0; l<max2; l++,ct2++) { 1246 #if defined (PETSC_USE_CTABLE) 1247 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1248 #else 1249 tcol = cmap_i[rbuf3_i[ct2]]; 1250 #endif 1251 if (tcol) { 1252 lens_i[row]++; 1253 } 1254 } 1255 } 1256 } 1257 } 1258 } 1259 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1260 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1261 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1262 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1263 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1264 1265 /* Create the submatrices */ 1266 if (scall == MAT_REUSE_MATRIX) { 1267 PetscBool flag; 1268 1269 /* 1270 Assumes new rows are same length as the old rows,hence bug! 
1271 */ 1272 for (i=0; i<ismax; i++) { 1273 mat = (Mat_SeqAIJ *)(submats[i]->data); 1274 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1275 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1276 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1277 /* Initial matrix as if empty */ 1278 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1279 submats[i]->factortype = C->factortype; 1280 } 1281 } else { 1282 for (i=0; i<ismax; i++) { 1283 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1284 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1285 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1286 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1287 } 1288 } 1289 1290 /* Assemble the matrices */ 1291 /* First assemble the local rows */ 1292 { 1293 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1294 PetscScalar *imat_a; 1295 1296 for (i=0; i<ismax; i++) { 1297 mat = (Mat_SeqAIJ*)submats[i]->data; 1298 imat_ilen = mat->ilen; 1299 imat_j = mat->j; 1300 imat_i = mat->i; 1301 imat_a = mat->a; 1302 cmap_i = cmap[i]; 1303 rmap_i = rmap[i]; 1304 irow_i = irow[i]; 1305 jmax = nrow[i]; 1306 for (j=0; j<jmax; j++) { 1307 l = 0; 1308 row = irow_i[j]; 1309 while (row >= C->rmap->range[l+1]) l++; 1310 proc = l; 1311 if (proc == rank) { 1312 old_row = row; 1313 #if defined (PETSC_USE_CTABLE) 1314 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1315 row--; 1316 #else 1317 row = rmap_i[row]; 1318 #endif 1319 ilen_row = imat_ilen[row]; 1320 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1321 mat_i = imat_i[row] ; 1322 mat_a = imat_a + mat_i; 1323 mat_j = imat_j + mat_i; 1324 for (k=0; k<ncols; k++) { 1325 #if defined 
(PETSC_USE_CTABLE) 1326 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1327 #else 1328 tcol = cmap_i[cols[k]]; 1329 #endif 1330 if (tcol){ 1331 *mat_j++ = tcol - 1; 1332 *mat_a++ = vals[k]; 1333 ilen_row++; 1334 } 1335 } 1336 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1337 imat_ilen[row] = ilen_row; 1338 } 1339 } 1340 } 1341 } 1342 1343 /* Now assemble the off proc rows*/ 1344 { 1345 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1346 PetscInt *imat_j,*imat_i; 1347 PetscScalar *imat_a,*rbuf4_i; 1348 1349 for (tmp2=0; tmp2<nrqs; tmp2++) { 1350 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1351 idex = pa[idex2]; 1352 sbuf1_i = sbuf1[idex]; 1353 jmax = sbuf1_i[0]; 1354 ct1 = 2*jmax + 1; 1355 ct2 = 0; 1356 rbuf2_i = rbuf2[idex2]; 1357 rbuf3_i = rbuf3[idex2]; 1358 rbuf4_i = rbuf4[idex2]; 1359 for (j=1; j<=jmax; j++) { 1360 is_no = sbuf1_i[2*j-1]; 1361 rmap_i = rmap[is_no]; 1362 cmap_i = cmap[is_no]; 1363 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1364 imat_ilen = mat->ilen; 1365 imat_j = mat->j; 1366 imat_i = mat->i; 1367 imat_a = mat->a; 1368 max1 = sbuf1_i[2*j]; 1369 for (k=0; k<max1; k++,ct1++) { 1370 row = sbuf1_i[ct1]; 1371 #if defined (PETSC_USE_CTABLE) 1372 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1373 row--; 1374 #else 1375 row = rmap_i[row]; 1376 #endif 1377 ilen = imat_ilen[row]; 1378 mat_i = imat_i[row] ; 1379 mat_a = imat_a + mat_i; 1380 mat_j = imat_j + mat_i; 1381 max2 = rbuf2_i[ct1]; 1382 for (l=0; l<max2; l++,ct2++) { 1383 1384 #if defined (PETSC_USE_CTABLE) 1385 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1386 #else 1387 tcol = cmap_i[rbuf3_i[ct2]]; 1388 #endif 1389 if (tcol) { 1390 *mat_j++ = tcol - 1; 1391 *mat_a++ = rbuf4_i[ct2]; 1392 ilen++; 1393 } 1394 } 1395 imat_ilen[row] = ilen; 1396 } 1397 } 1398 } 1399 } 1400 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1401 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1402 if (nrqr) {ierr = 
MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1403 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1404 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1405 1406 /* Restore the indices */ 1407 for (i=0; i<ismax; i++) { 1408 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1409 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1410 } 1411 1412 /* Destroy allocated memory */ 1413 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1414 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1415 ierr = PetscFree(pa);CHKERRQ(ierr); 1416 1417 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1418 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1419 for (i=0; i<nrqr; ++i) { 1420 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1421 } 1422 for (i=0; i<nrqs; ++i) { 1423 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1424 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1425 } 1426 1427 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1428 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1429 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1430 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1431 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1432 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1433 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1434 #if defined (PETSC_USE_CTABLE) 1435 for (i=0; i<ismax; i++) { 1436 ierr = PetscTableDestroy((PetscTable)cmap[i]);CHKERRQ(ierr); 1437 ierr = PetscTableDestroy((PetscTable)rmap[i]);CHKERRQ(ierr); 1438 } 1439 #else 1440 ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 1441 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1442 #endif 1443 ierr = PetscFree(cmap);CHKERRQ(ierr); 1444 ierr = PetscFree(rmap);CHKERRQ(ierr); 1445 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1446 ierr = PetscFree(lens);CHKERRQ(ierr); 1447 1448 for (i=0; i<ismax; i++) { 1449 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1450 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1451 } 1452 PetscFunctionReturn(0); 1453 } 1454 1455 /* 1456 Observe that the Seq matrices used to construct this 
MPI matrix are not increfed.
  Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
  Assembles a parallel MPIAIJ matrix *C on comm from its two sequential pieces:
  A (the diagonal block) and B (the off-diagonal block).  Both must be SEQAIJ
  and dimensionally compatible.  A and B are NOT increfed (see note above):
  *C takes them over without adding a reference, so the caller must not
  destroy them separately.

  Errors are reported as PETSC_ERR_PLIB because this routine is private; if it
  is ever made public the error codes should be changed (see comment below).
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  }
  ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  }
  /* A and B must have the same local row count and matching block sizes. */
  if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatCreate(comm, C); CHKERRQ(ierr);
  /* Local sizes come from the diag block A; global sizes are derived by PETSc. */
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The off-diag block B must account for exactly the columns A does not cover. */
  if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
  /* Install A and B directly as the diag/off-diag blocks -- no incref (see header note). */
  aij = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
  /* *C is preallocated/assembled only if both component matrices are. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
  Returns the internal diagonal (*A) and off-diagonal (*B) sequential blocks of
  the MPIAIJ matrix C.  The blocks are NOT increfed, so the caller must not
  destroy them.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
  Extracts (possibly parallel) submatrices of C, one per (isrow[i], iscol[i])
  pair.  Index-set pairs living on a 1-process communicator yield sequential
  submatrices; pairs on larger communicators yield parallel submatrices built
  from separately extracted diag and off-diag sequential pieces.

  Callbacks (supplied so this driver works for any AIJ-like format):
    getsubmats_seq - extracts sequential submatrices (diag or off-diag parts)
    makefromseq    - assembles a parallel matrix from a diag/off-diag seq pair
    extractseq     - recovers the diag/off-diag seq pair from a parallel matrix
                     (used only on the MAT_REUSE_MATRIX path)

  scall == MAT_REUSE_MATRIX requires *submat to hold matrices from a previous
  MAT_INITIAL_MATRIX call with the same index sets.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;      /* number of parallel (comm size > 1) IS pairs */
  PetscFunctionBegin;
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate the IS pairs and count the parallel ones. */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else { /* if(ismax_c) */
    Mat *A,*B;     /* seq diag blocks (all i) / seq off-diag blocks (parallel i only) */
    IS  *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
      array of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
      getsubmats_seq will deposit the extracted diag and off-diag parts.
      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /* ii indexes only the parallel IS pairs; it lags i by the number of seq pairs seen. */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        }
        else {
          A[i] = (*submat)[i];
        }
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
        isrow_c[ii] = isrow[i];   /* alias: not a new reference -- NOTE(review): assumes ISGetNonlocalIS is the only destroy needed below; confirm */
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B.
          */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Only the placeholder arrays are freed; the seq matrices themselves now live
       inside (or are aliased by) the entries of *submat. */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
  MPIAIJ instantiation of the generic parallel-submatrix driver above,
  wired to the AIJ-specific extraction/assembly callbacks.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
                                           MatCreateMPIAIJFromSeqMatrices_Private,
                                           MatMPIAIJExtractSeqMatrices_Private);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}