
/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include "../src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Grows each of the imax index sets is[] by ov
   levels of overlap; each level is performed by one call to
   MatIncreaseOverlap_MPIAIJ_Once().  Collective on the matrix communicator.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1   => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs a single round of overlap growth:
   (1) each process counts and sends the rows of its index sets that live on
   other processes, (2) owners expand those rows locally (_Receive) while each
   process also expands its own rows (_Local), (3) the expanded row lists are
   sent back and merged into new index sets, destroying the originals.
   NOTE: the input IS objects in is[] are destroyed and replaced.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  m    = C->rmap->N;   /* global number of rows; tables below are sized by m */

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc (dense lookup table over
     all global rows, filled from the ownership ranges) */
  for (i=0,j=0; i<size; i++) {
    len = C->rmap->range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total indices destined for proc p, w3[p] = number of ISs that
     reference proc p, w4 = per-IS scratch counts */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      proc = rtable[row];
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;   /* no of outgoing messages */
  msz      = 0;   /* total mesg length (for all procs) */
  w1[rank] = 0;   /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages: all messages share one pool (tmp);
     outdat[p] points at proc p's slice, ptr[p] is the running write cursor */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: a single allocation carved into
     table[] (bit tables), data[] (row lists), isz[] (row counts) */
  {
    PetscInt *d_p;
    char     *t_p;

    /* should replace with PetscMallocN() */
    ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                       (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                        (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data = (PetscInt **)(table + imax);
    isz  = (PetscInt *)(data + imax);
    d_p  = (PetscInt *)(isz + imax);
    t_p  = (char *)(d_p + m*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (m)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table; BT lookup deduplicates rows */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  /* the input ISs are destroyed here; they are rebuilt at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps with communication above) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the requests on behalf of the other processes */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge into local tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message; same header layout as the outgoing format */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Rebuild the (now enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;                                    /* diagonal (A) and off-diagonal (B) local blocks */
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;     /* first global row owned by this process */
  cstart = C->cmap->rstart;     /* first global column of the diagonal block */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;           /* maps compressed B columns -> global columns */


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* For every row already in set i, append the global column indices of its
       nonzeros; the bit table deduplicates.  New entries appended after isz[i]
       are deliberately NOT revisited in this pass (one level of overlap). */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;   /* local row index */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: columns are local, shift by cstart */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: translate through garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
413 rbuf - an array of pointers to the recieved requests 414 415 Output: 416 xdata - array of messages to be sent back 417 isz1 - size of each message 418 419 For better efficiency perhaps we should malloc separately each xdata[i], 420 then if a remalloc is required we need only copy the data for that one row 421 rather then all previous rows as it is now where a single large chunck of 422 memory is used. 423 424 */ 425 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 426 { 427 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 428 Mat A = c->A,B = c->B; 429 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 430 PetscErrorCode ierr; 431 PetscMPIInt rank; 432 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 433 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 434 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 435 PetscInt *rbuf_i,kmax,rbuf_0; 436 PetscBT xtable; 437 438 PetscFunctionBegin; 439 rank = c->rank; 440 m = C->rmap->N; 441 rstart = C->rmap->rstart; 442 cstart = C->cmap->rstart; 443 ai = a->i; 444 aj = a->j; 445 bi = b->i; 446 bj = b->j; 447 garray = c->garray; 448 449 450 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 451 rbuf_i = rbuf[i]; 452 rbuf_0 = rbuf_i[0]; 453 ct += rbuf_0; 454 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 455 } 456 457 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 458 else max1 = 1; 459 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 460 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 461 ++no_malloc; 462 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 463 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 464 465 ct3 = 0; 466 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 467 rbuf_i = rbuf[i]; 468 rbuf_0 = rbuf_i[0]; 469 ct1 = 2*rbuf_0+1; 470 ct2 = ct1; 471 ct3 += ct1; 472 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 473 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 474 oct2 = ct2; 475 kmax = rbuf_i[2*j]; 476 for (k=0; k<kmax; k++,ct1++) { 477 row = rbuf_i[ct1]; 478 if (!PetscBTLookupSet(xtable,row)) { 479 if (!(ct3 < mem_estimate)) { 480 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 481 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 482 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 483 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 484 xdata[0] = tmp; 485 mem_estimate = new_estimate; ++no_malloc; 486 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 487 } 488 xdata[i][ct2++] = row; 489 ct3++; 490 } 491 } 492 for (k=oct2,max2=ct2; k<max2; k++) { 493 row = xdata[i][k] - rstart; 494 start = ai[row]; 495 end = ai[row+1]; 496 for (l=start; l<end; l++) { 497 val = aj[l] + cstart; 498 if (!PetscBTLookupSet(xtable,val)) { 499 if (!(ct3 < mem_estimate)) { 500 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 501 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 502 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 503 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 504 xdata[0] = tmp; 505 mem_estimate = new_estimate; ++no_malloc; 506 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 507 } 508 xdata[i][ct2++] = val; 509 ct3++; 510 } 511 } 512 start = bi[row]; 513 end = bi[row+1]; 514 for (l=start; l<end; l++) { 515 val = garray[bj[l]]; 516 if (!PetscBTLookupSet(xtable,val)) { 517 if (!(ct3 < 
mem_estimate)) { 518 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 519 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 520 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 521 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 522 xdata[0] = tmp; 523 mem_estimate = new_estimate; ++no_malloc; 524 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 525 } 526 xdata[i][ct2++] = val; 527 ct3++; 528 } 529 } 530 } 531 /* Update the header*/ 532 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 533 xdata[i][2*j-1] = rbuf_i[2*j-1]; 534 } 535 xdata[i][0] = rbuf_0; 536 xdata[i+1] = xdata[i] + ct2; 537 isz1[i] = ct2; /* size of each message */ 538 } 539 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 540 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 541 PetscFunctionReturn(0); 542 } 543 /* -------------------------------------------------------------------------*/ 544 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 545 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 546 /* 547 Every processor gets the entire matrix 548 */ 549 #undef __FUNCT__ 550 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 551 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 552 { 553 Mat B; 554 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 555 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 556 PetscErrorCode ierr; 557 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 558 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 559 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 560 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 561 562 PetscFunctionBegin; 563 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 564 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* row length = diagonal-block nonzeros + off-diagonal-block nonzeros */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over.  The local rows are built
       directly into B's j array at this rank's offset, merging the
       off-diagonal (B) columns below the diagonal, then the diagonal (A)
       columns, then the remaining off-diagonal columns, so each row comes
       out in global column order.
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* reuse: the sequential copy already exists with the right structure */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* values interleaved in the same lower/diagonal/upper order used above
       for the column indices, so values line up with b->j */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts sequential submatrices; detects the
   special "everyone wants the whole matrix" case and otherwise performs the
   extraction in stages via MatGetSubMatrices_MPIAIJ_Local.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* MPI_MIN over the bool: take the fast path only if EVERY process wants it */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  /*
     Each stage will extract nmax submatrices.
771 nmax is determined by the matrix column dimension. 772 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 773 */ 774 if (!nmax) nmax = 1; 775 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 776 777 /* Make sure every processor loops through the nstages */ 778 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 779 780 for (i=0,pos=0; i<nstages; i++) { 781 if (pos+nmax <= ismax) max_no = nmax; 782 else if (pos == ismax) max_no = 0; 783 else max_no = ismax-pos; 784 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 785 pos += max_no; 786 } 787 PetscFunctionReturn(0); 788 } 789 790 791 792 793 /* -------------------------------------------------------------------------*/ 794 #undef __FUNCT__ 795 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 796 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 797 { 798 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 799 Mat A = c->A; 800 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 801 const PetscInt **icol,**irow; 802 PetscInt *nrow,*ncol,start; 803 PetscErrorCode ierr; 804 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 805 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 806 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 807 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap; 808 PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 809 const PetscInt *irow_i; 810 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 811 PetscInt *rmap_i; 812 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 813 MPI_Request *r_waits4,*s_waits3,*s_waits4; 814 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 815 MPI_Status *r_status3,*r_status4,*s_status4; 816 MPI_Comm comm; 817 PetscScalar 
**rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 818 PetscBool sorted; 819 PetscMPIInt *onodes1,*olengths1; 820 PetscMPIInt idex,idex2,end; 821 822 PetscFunctionBegin; 823 comm = ((PetscObject)C)->comm; 824 tag0 = ((PetscObject)C)->tag; 825 size = c->size; 826 rank = c->rank; 827 828 /* Get some new tags to keep the communication clean */ 829 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 830 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 831 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 832 833 /* Check if the col indices are sorted */ 834 for (i=0; i<ismax; i++) { 835 ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr); 836 /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/ 837 ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr); 838 /* if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */ 839 } 840 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 841 842 for (i=0; i<ismax; i++) { 843 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 844 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 845 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 846 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 847 } 848 849 /* evaluate communication - mesg to who, length of mesg, and buffer space 850 required. 
Based on this, buffers are allocated, and data copied into them*/ 851 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 852 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 853 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 854 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 855 for (i=0; i<ismax; i++) { 856 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 857 jmax = nrow[i]; 858 irow_i = irow[i]; 859 for (j=0; j<jmax; j++) { 860 l = 0; 861 row = irow_i[j]; 862 while (row >= C->rmap->range[l+1]) l++; 863 proc = l; 864 w4[proc]++; 865 } 866 for (j=0; j<size; j++) { 867 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 868 } 869 } 870 871 nrqs = 0; /* no of outgoing messages */ 872 msz = 0; /* total mesg length (for all procs) */ 873 w1[rank] = 0; /* no mesg sent to self */ 874 w3[rank] = 0; 875 for (i=0; i<size; i++) { 876 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 877 } 878 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 879 for (i=0,j=0; i<size; i++) { 880 if (w1[i]) { pa[j] = i; j++; } 881 } 882 883 /* Each message would have a header = 1 + 2*(no of IS) + data */ 884 for (i=0; i<nrqs; i++) { 885 j = pa[i]; 886 w1[j] += w2[j] + 2* w3[j]; 887 msz += w1[j]; 888 } 889 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 890 891 /* Determine the number of messages to expect, their lengths, from from-ids */ 892 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 893 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 894 895 /* Now post the Irecvs corresponding to these messages */ 896 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 897 898 ierr = PetscFree(onodes1);CHKERRQ(ierr); 899 ierr = PetscFree(olengths1);CHKERRQ(ierr); 900 901 /* Allocate Memory for outgoing messages */ 902 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 903 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 904 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 905 906 { 907 PetscInt *iptr = tmp,ict = 0; 908 for (i=0; i<nrqs; i++) { 909 j = pa[i]; 910 iptr += ict; 911 sbuf1[j] = iptr; 912 ict = w1[j]; 913 } 914 } 915 916 /* Form the outgoing messages */ 917 /* Initialize the header space */ 918 for (i=0; i<nrqs; i++) { 919 j = pa[i]; 920 sbuf1[j][0] = 0; 921 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 922 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 923 } 924 925 /* Parse the isrow and copy data into outbuf */ 926 for (i=0; i<ismax; i++) { 927 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 928 irow_i = irow[i]; 929 jmax = nrow[i]; 930 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 931 l = 0; 932 row = irow_i[j]; 933 while (row >= C->rmap->range[l+1]) l++; 934 proc = l; 935 if (proc != rank) { /* copy to the outgoing buf*/ 936 ctr[proc]++; 937 *ptr[proc] = row; 938 ptr[proc]++; 939 } 940 } 941 /* Update the headers for the current IS */ 942 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 943 if ((ctr_j = ctr[j])) { 944 sbuf1_j = sbuf1[j]; 945 k = ++sbuf1_j[0]; 946 sbuf1_j[2*k] = ctr_j; 947 sbuf1_j[2*k-1] = i; 948 } 949 } 950 } 951 952 /* Now post the sends */ 953 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 954 for (i=0; i<nrqs; ++i) { 955 j = pa[i]; 956 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 957 } 958 959 /* Post Receives to capture the buffer size */ 960 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 961 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 962 rbuf2[0] = tmp + msz; 963 for (i=1; i<nrqs; ++i) { 964 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 965 } 966 for (i=0; i<nrqs; ++i) { 967 j = pa[i]; 968 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 969 } 970 971 /* Send to other procs the buf size they should allocate */ 972 973 974 /* Receive messages*/ 975 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 976 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 977 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 978 { 979 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 980 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 981 PetscInt *sbuf2_i; 982 983 for (i=0; i<nrqr; ++i) { 984 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 985 req_size[idex] = 0; 986 rbuf1_i = rbuf1[idex]; 987 start = 2*rbuf1_i[0] + 1; 988 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 989 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 990 sbuf2_i = sbuf2[idex]; 991 for (j=start; j<end; j++) { 992 id = rbuf1_i[j] - rstart; 993 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 994 sbuf2_i[j] = ncols; 995 req_size[idex] += ncols; 996 } 997 req_source[idex] = r_status1[i].MPI_SOURCE; 998 /* form the header */ 999 sbuf2_i[0] = req_size[idex]; 1000 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1001 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1002 } 1003 } 1004 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1005 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1006 1007 /* recv buffer sizes */ 1008 /* Receive messages*/ 1009 1010 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1011 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1012 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1013 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1014 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1015 1016 for (i=0; i<nrqs; ++i) { 1017 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1018 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1019 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1020 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1021 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1022 } 1023 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1024 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1025 1026 /* Wait on sends1 and sends2 */ 1027 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1028 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1029 1030 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1031 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1032 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1033 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1034 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1035 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1036 1037 /* Now allocate buffers for a->j, and send them off */ 1038 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1039 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1040 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1041 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1042 1043 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1044 { 1045 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1046 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1047 PetscInt cend = C->cmap->rend; 1048 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1049 1050 for (i=0; i<nrqr; i++) { 1051 rbuf1_i = rbuf1[i]; 1052 sbuf_aj_i = sbuf_aj[i]; 1053 ct1 = 2*rbuf1_i[0] + 1; 1054 ct2 = 0; 1055 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1056 kmax = rbuf1[i][2*j]; 1057 for (k=0; k<kmax; k++,ct1++) { 1058 row = rbuf1_i[ct1] - rstart; 1059 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1060 ncols = nzA + nzB; 1061 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1062 1063 /* load the column indices for this row into cols*/ 1064 cols = sbuf_aj_i + ct2; 1065 1066 lwrite = 0; 1067 for (l=0; l<nzB; l++) { 1068 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1069 } 1070 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1071 for (l=0; l<nzB; l++) { 1072 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1073 } 1074 1075 ct2 += ncols; 1076 } 1077 } 1078 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1079 } 1080 } 1081 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1082 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1083 1084 /* Allocate buffers for a->a, and send them off */ 1085 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1086 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1087 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1088 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1089 1090 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1091 { 1092 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1093 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1094 PetscInt cend = C->cmap->rend; 1095 PetscInt *b_j = b->j; 1096 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1097 1098 for (i=0; 
i<nrqr; i++) { 1099 rbuf1_i = rbuf1[i]; 1100 sbuf_aa_i = sbuf_aa[i]; 1101 ct1 = 2*rbuf1_i[0]+1; 1102 ct2 = 0; 1103 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1104 kmax = rbuf1_i[2*j]; 1105 for (k=0; k<kmax; k++,ct1++) { 1106 row = rbuf1_i[ct1] - rstart; 1107 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1108 ncols = nzA + nzB; 1109 cworkB = b_j + b_i[row]; 1110 vworkA = a_a + a_i[row]; 1111 vworkB = b_a + b_i[row]; 1112 1113 /* load the column values for this row into vals*/ 1114 vals = sbuf_aa_i+ct2; 1115 1116 lwrite = 0; 1117 for (l=0; l<nzB; l++) { 1118 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1119 } 1120 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1121 for (l=0; l<nzB; l++) { 1122 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1123 } 1124 1125 ct2 += ncols; 1126 } 1127 } 1128 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1129 } 1130 } 1131 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1132 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1133 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1134 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1135 1136 /* Form the matrix */ 1137 /* create col map */ 1138 { 1139 const PetscInt *icol_i; 1140 1141 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1142 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1143 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1144 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1145 for (i=0; i<ismax; i++) { 1146 jmax = ncol[i]; 1147 icol_i = icol[i]; 1148 cmap_i = cmap[i]; 1149 for (j=0; j<jmax; j++) { 1150 cmap_i[icol_i[j]] = j+1; 1151 } 1152 } 1153 } 1154 1155 /* Create lens which is required for MatCreate... 
*/ 1156 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1157 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1158 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1159 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1160 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1161 1162 /* Update lens from local data */ 1163 for (i=0; i<ismax; i++) { 1164 jmax = nrow[i]; 1165 cmap_i = cmap[i]; 1166 irow_i = irow[i]; 1167 lens_i = lens[i]; 1168 for (j=0; j<jmax; j++) { 1169 l = 0; 1170 row = irow_i[j]; 1171 while (row >= C->rmap->range[l+1]) l++; 1172 proc = l; 1173 if (proc == rank) { 1174 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1175 for (k=0; k<ncols; k++) { 1176 if (cmap_i[cols[k]]) { lens_i[j]++;} 1177 } 1178 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1179 } 1180 } 1181 } 1182 1183 /* Create row map*/ 1184 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1185 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1186 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1187 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1188 for (i=0; i<ismax; i++) { 1189 rmap_i = rmap[i]; 1190 irow_i = irow[i]; 1191 jmax = nrow[i]; 1192 for (j=0; j<jmax; j++) { 1193 rmap_i[irow_i[j]] = j; 1194 } 1195 } 1196 1197 /* Update lens from offproc data */ 1198 { 1199 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1200 1201 for (tmp2=0; tmp2<nrqs; tmp2++) { 1202 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1203 idex = pa[idex2]; 1204 sbuf1_i = sbuf1[idex]; 1205 jmax = sbuf1_i[0]; 1206 ct1 = 2*jmax+1; 1207 ct2 = 0; 1208 rbuf2_i = rbuf2[idex2]; 1209 rbuf3_i = rbuf3[idex2]; 1210 for (j=1; j<=jmax; j++) { 1211 is_no = sbuf1_i[2*j-1]; 1212 max1 = sbuf1_i[2*j]; 1213 lens_i = lens[is_no]; 1214 cmap_i = cmap[is_no]; 1215 rmap_i = rmap[is_no]; 1216 for (k=0; k<max1; k++,ct1++) { 1217 row = rmap_i[sbuf1_i[ct1]]; 
/* the val in the new matrix to be */ 1218 max2 = rbuf2_i[ct1]; 1219 for (l=0; l<max2; l++,ct2++) { 1220 if (cmap_i[rbuf3_i[ct2]]) { 1221 lens_i[row]++; 1222 } 1223 } 1224 } 1225 } 1226 } 1227 } 1228 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1229 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1230 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1231 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1232 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1233 1234 /* Create the submatrices */ 1235 if (scall == MAT_REUSE_MATRIX) { 1236 PetscBool flag; 1237 1238 /* 1239 Assumes new rows are same length as the old rows,hence bug! 1240 */ 1241 for (i=0; i<ismax; i++) { 1242 mat = (Mat_SeqAIJ *)(submats[i]->data); 1243 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1244 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1245 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong no of nonzeros"); 1246 /* Initial matrix as if empty */ 1247 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1248 submats[i]->factortype = C->factortype; 1249 } 1250 } else { 1251 for (i=0; i<ismax; i++) { 1252 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1253 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1254 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1255 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1256 } 1257 } 1258 1259 /* Assemble the matrices */ 1260 /* First assemble the local rows */ 1261 { 1262 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1263 PetscScalar *imat_a; 1264 1265 for (i=0; i<ismax; i++) { 1266 mat = (Mat_SeqAIJ*)submats[i]->data; 1267 imat_ilen = mat->ilen; 1268 imat_j = mat->j; 1269 imat_i = mat->i; 1270 imat_a = mat->a; 1271 cmap_i = cmap[i]; 1272 rmap_i = rmap[i]; 1273 irow_i = irow[i]; 1274 jmax = nrow[i]; 1275 for (j=0; j<jmax; j++) { 1276 l = 0; 1277 row = irow_i[j]; 1278 while (row >= C->rmap->range[l+1]) l++; 1279 proc = l; 1280 if (proc == rank) { 1281 old_row = row; 1282 row = rmap_i[row]; 1283 ilen_row = imat_ilen[row]; 1284 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1285 mat_i = imat_i[row] ; 1286 mat_a = imat_a + mat_i; 1287 mat_j = imat_j + mat_i; 1288 for (k=0; k<ncols; k++) { 1289 if ((tcol = cmap_i[cols[k]])) { 1290 *mat_j++ = tcol - 1; 1291 *mat_a++ = vals[k]; 1292 ilen_row++; 1293 } 1294 } 1295 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1296 imat_ilen[row] = ilen_row; 1297 } 1298 } 1299 } 1300 } 1301 1302 /* Now assemble the off proc rows*/ 1303 { 1304 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1305 PetscInt *imat_j,*imat_i; 1306 PetscScalar *imat_a,*rbuf4_i; 1307 1308 for (tmp2=0; tmp2<nrqs; tmp2++) { 1309 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1310 idex = 
pa[idex2]; 1311 sbuf1_i = sbuf1[idex]; 1312 jmax = sbuf1_i[0]; 1313 ct1 = 2*jmax + 1; 1314 ct2 = 0; 1315 rbuf2_i = rbuf2[idex2]; 1316 rbuf3_i = rbuf3[idex2]; 1317 rbuf4_i = rbuf4[idex2]; 1318 for (j=1; j<=jmax; j++) { 1319 is_no = sbuf1_i[2*j-1]; 1320 rmap_i = rmap[is_no]; 1321 cmap_i = cmap[is_no]; 1322 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1323 imat_ilen = mat->ilen; 1324 imat_j = mat->j; 1325 imat_i = mat->i; 1326 imat_a = mat->a; 1327 max1 = sbuf1_i[2*j]; 1328 for (k=0; k<max1; k++,ct1++) { 1329 row = sbuf1_i[ct1]; 1330 row = rmap_i[row]; 1331 ilen = imat_ilen[row]; 1332 mat_i = imat_i[row] ; 1333 mat_a = imat_a + mat_i; 1334 mat_j = imat_j + mat_i; 1335 max2 = rbuf2_i[ct1]; 1336 for (l=0; l<max2; l++,ct2++) { 1337 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1338 *mat_j++ = tcol - 1; 1339 *mat_a++ = rbuf4_i[ct2]; 1340 ilen++; 1341 } 1342 } 1343 imat_ilen[row] = ilen; 1344 } 1345 } 1346 } 1347 } 1348 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1349 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1350 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1351 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1352 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1353 1354 /* Restore the indices */ 1355 for (i=0; i<ismax; i++) { 1356 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1357 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1358 } 1359 1360 /* Destroy allocated memory */ 1361 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1362 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1363 ierr = PetscFree(pa);CHKERRQ(ierr); 1364 1365 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1366 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1367 for (i=0; i<nrqr; ++i) { 1368 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1369 } 1370 for (i=0; i<nrqs; ++i) { 1371 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1372 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1373 } 1374 1375 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1376 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 
1377 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1378 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1379 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1380 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1381 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1382 1383 ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 1384 ierr = PetscFree(cmap);CHKERRQ(ierr); 1385 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1386 ierr = PetscFree(rmap);CHKERRQ(ierr); 1387 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1388 ierr = PetscFree(lens);CHKERRQ(ierr); 1389 1390 for (i=0; i<ismax; i++) { 1391 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1392 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1393 } 1394 PetscFunctionReturn(0); 1395 } 1396 1397 /* 1398 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1399 Be careful not to destroy them elsewhere. 1400 */ 1401 #undef __FUNCT__ 1402 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1403 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1404 { 1405 /* If making this function public, change the error returned in this function away from _PLIB. */ 1406 PetscErrorCode ierr; 1407 Mat_MPIAIJ *aij; 1408 PetscBool seqaij; 1409 1410 PetscFunctionBegin; 1411 /* Check to make sure the component matrices are compatible with C. 
*/ 1412 ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1413 if(!seqaij) { 1414 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1415 } 1416 ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1417 if(!seqaij) { 1418 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1419 } 1420 if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) { 1421 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1422 } 1423 ierr = MatCreate(comm, C); CHKERRQ(ierr); 1424 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr); 1425 ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr); 1426 ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr); 1427 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1428 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1429 if((*C)->cmap->N != A->cmap->n + B->cmap->n) { 1430 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1431 } 1432 ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr); 1433 aij = (Mat_MPIAIJ*)((*C)->data); 1434 aij->A = A; 1435 aij->B = B; 1436 ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr); 1437 ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr); 1438 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1439 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1440 PetscFunctionReturn(0); 1441 } 1442 1443 #undef __FUNCT__ 1444 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1445 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1446 { 1447 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1448 PetscFunctionBegin; 1449 PetscValidPointer(A,2); 1450 PetscValidPointer(B,3); 1451 *A = aij->A; 1452 *B = aij->B; 1453 /* Note that we don't incref *A and *B, so be careful! 
*/ 1454 PetscFunctionReturn(0); 1455 } 1456 1457 1458 #undef __FUNCT__ 1459 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1460 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1461 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1462 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1463 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1464 { 1465 PetscErrorCode ierr; 1466 PetscMPIInt size, flag; 1467 PetscInt i,ii; 1468 PetscInt ismax_c; 1469 PetscFunctionBegin; 1470 if(!ismax) { 1471 PetscFunctionReturn(0); 1472 } 1473 for(i = 0, ismax_c = 0; i < ismax; ++i) { 1474 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr); 1475 if(flag != MPI_IDENT) { 1476 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1477 } 1478 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1479 if(size > 1) { 1480 ++ismax_c; 1481 } 1482 } 1483 if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 1484 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr); 1485 } 1486 else { /* if(ismax_c) */ 1487 Mat *A,*B; 1488 IS *isrow_c, *iscol_c; 1489 PetscMPIInt size; 1490 /* 1491 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1492 array of sequential matrices underlying the resulting parallel matrices. 1493 Which arrays to allocate is based on the value of MatReuse scall. 1494 There are as many diag matrices as there are original index sets. 1495 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 
1496 1497 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1498 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1499 will deposite the extracted diag and off-diag parts. 1500 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1501 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1502 */ 1503 1504 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1505 if (scall != MAT_REUSE_MATRIX) { 1506 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1507 } 1508 else { 1509 ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr); 1510 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr); 1511 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1512 for(i = 0, ii = 0; i < ismax; ++i) { 1513 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1514 if(size > 1) { 1515 ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr); 1516 ++ii; 1517 } 1518 else { 1519 A[i] = (*submat)[i]; 1520 } 1521 } 1522 } 1523 /* 1524 Construct the complements of the iscol ISs for parallel ISs only. 1525 These are used to extract the off-diag portion of the resulting parallel matrix. 1526 The row IS for the off-diag portion is the same as for the diag portion, 1527 so we merely alias the row IS, while skipping those that are sequential. 1528 */ 1529 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr); 1530 for(i = 0, ii = 0; i < ismax; ++i) { 1531 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1532 if(size > 1) { 1533 isrow_c[ii] = isrow[i]; 1534 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr); 1535 ++ii; 1536 } 1537 } 1538 /* Now obtain the sequential A and B submatrices separately. 
*/ 1539 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr); 1540 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr); 1541 for(ii = 0; ii < ismax_c; ++ii) { 1542 ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr); 1543 } 1544 ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr); 1545 /* 1546 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1547 have been extracted directly into the parallel matrices containing them, or 1548 simply into the sequential matrix identical with the corresponding A (if size == 1). 1549 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1550 A is put into the correct submat slot (if size == 1). 1551 */ 1552 if(scall != MAT_REUSE_MATRIX) { 1553 for(i = 0, ii = 0; i < ismax; ++i) { 1554 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1555 if(size > 1) { 1556 /* 1557 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1558 */ 1559 /* Construct submat[i] from the Seq pieces A and B. */ 1560 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr); 1561 1562 ++ii; 1563 } 1564 else { 1565 (*submat)[i] = A[i]; 1566 } 1567 } 1568 } 1569 ierr = PetscFree(A); CHKERRQ(ierr); 1570 ierr = PetscFree(B); CHKERRQ(ierr); 1571 } 1572 PetscFunctionReturn(0); 1573 }/* MatGetSubMatricesParallel_MPIXAIJ() */ 1574 1575 1576 1577 #undef __FUNCT__ 1578 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1579 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1580 { 1581 PetscErrorCode ierr; 1582 PetscFunctionBegin; 1583 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ, 1584 MatCreateMPIAIJFromSeqMatrices_Private, 1585 MatMPIAIJExtractSeqMatrices_Private); 1586 CHKERRQ(ierr); 1587 PetscFunctionReturn(0); 1588 } 1589