1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *); 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*); 12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 15 #undef __FUNCT__ 16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 18 { 19 PetscErrorCode ierr; 20 PetscInt i; 21 22 PetscFunctionBegin; 23 if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 24 for (i=0; i<ov; ++i) { 25 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 26 } 27 PetscFunctionReturn(0); 28 } 29 30 /* 31 Sample message format: 32 If a processor A wants processor B to process some elements corresponding 33 to index sets is[1],is[5] 34 mesg [0] = 2 (no of index sets in the mesg) 35 ----------- 36 mesg [1] = 1 => is[1] 37 mesg [2] = sizeof(is[1]); 38 ----------- 39 mesg [3] = 5 => is[5] 40 mesg [4] = sizeof(is[5]); 41 ----------- 42 mesg [5] 43 mesg [n] datas[1] 44 ----------- 45 mesg[n+1] 46 mesg[m] data(is[5]) 47 ----------- 48 49 Notes: 50 nrqs - no of requests sent (or to be sent out) 51 nrqr - no of requests recieved (which have to be or which have been processed 52 */ 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 56 { 57 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 58 PetscMPIInt 
*w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 59 const PetscInt **idx,*idx_i; 60 PetscInt *n,**data,len; 61 PetscErrorCode ierr; 62 PetscMPIInt size,rank,tag1,tag2; 63 PetscInt m,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 64 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2; 65 PetscBT *table; 66 MPI_Comm comm; 67 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 68 MPI_Status *s_status,*recv_status; 69 70 PetscFunctionBegin; 71 comm = ((PetscObject)C)->comm; 72 size = c->size; 73 rank = c->rank; 74 m = C->rmap->N; 75 76 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 77 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 78 79 ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr); 80 81 for (i=0; i<imax; i++) { 82 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 83 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 84 } 85 86 /* evaluate communication - mesg to who,length of mesg, and buffer space 87 required. Based on this, buffers are allocated, and data copied into them*/ 88 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); 89 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 90 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 91 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 92 for (i=0; i<imax; i++) { 93 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 94 idx_i = idx[i]; 95 len = n[i]; 96 for (j=0; j<len; j++) { 97 row = idx_i[j]; 98 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 99 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 100 w4[proc]++; 101 } 102 for (j=0; j<size; j++){ 103 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 104 } 105 } 106 107 nrqs = 0; /* no of outgoing messages 
*/ 108 msz = 0; /* total mesg length (for all proc */ 109 w1[rank] = 0; /* no mesg sent to intself */ 110 w3[rank] = 0; 111 for (i=0; i<size; i++) { 112 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 113 } 114 /* pa - is list of processors to communicate with */ 115 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); 116 for (i=0,j=0; i<size; i++) { 117 if (w1[i]) {pa[j] = i; j++;} 118 } 119 120 /* Each message would have a header = 1 + 2*(no of IS) + data */ 121 for (i=0; i<nrqs; i++) { 122 j = pa[i]; 123 w1[j] += w2[j] + 2*w3[j]; 124 msz += w1[j]; 125 } 126 127 /* Determine the number of messages to expect, their lengths, from from-ids */ 128 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 129 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 130 131 /* Now post the Irecvs corresponding to these messages */ 132 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 133 134 /* Allocate Memory for outgoing messages */ 135 ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 136 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 137 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 138 139 { 140 PetscInt *iptr = tmp,ict = 0; 141 for (i=0; i<nrqs; i++) { 142 j = pa[i]; 143 iptr += ict; 144 outdat[j] = iptr; 145 ict = w1[j]; 146 } 147 } 148 149 /* Form the outgoing messages */ 150 /*plug in the headers*/ 151 for (i=0; i<nrqs; i++) { 152 j = pa[i]; 153 outdat[j][0] = 0; 154 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 155 ptr[j] = outdat[j] + 2*w3[j] + 1; 156 } 157 158 /* Memory for doing local proc's work*/ 159 { 160 PetscInt *d_p; 161 char *t_p; 162 163 /* should replace with PetscMallocN() */ 164 ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) + 165 (m)*imax*sizeof(PetscInt) + 
(m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr); 166 ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) + 167 (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr); 168 data = (PetscInt **)(table + imax); 169 isz = (PetscInt *)(data + imax); 170 d_p = (PetscInt *)(isz + imax); 171 t_p = (char *)(d_p + m*imax); 172 for (i=0; i<imax; i++) { 173 table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i; 174 data[i] = d_p + (m)*i; 175 } 176 } 177 178 /* Parse the IS and update local tables and the outgoing buf with the data*/ 179 { 180 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 181 PetscBT table_i; 182 183 for (i=0; i<imax; i++) { 184 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 185 n_i = n[i]; 186 table_i = table[i]; 187 idx_i = idx[i]; 188 data_i = data[i]; 189 isz_i = isz[i]; 190 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 191 row = idx_i[j]; 192 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 193 if (proc != rank) { /* copy to the outgoing buffer */ 194 ctr[proc]++; 195 *ptr[proc] = row; 196 ptr[proc]++; 197 } else { /* Update the local table */ 198 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 199 } 200 } 201 /* Update the headers for the current IS */ 202 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 203 if ((ctr_j = ctr[j])) { 204 outdat_j = outdat[j]; 205 k = ++outdat_j[0]; 206 outdat_j[2*k] = ctr_j; 207 outdat_j[2*k-1] = i; 208 } 209 } 210 isz[i] = isz_i; 211 } 212 } 213 214 /* Now post the sends */ 215 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 216 for (i=0; i<nrqs; ++i) { 217 j = pa[i]; 218 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 219 } 220 221 /* No longer need the original indices*/ 222 for (i=0; i<imax; ++i) { 223 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 224 } 225 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 226 
227 for (i=0; i<imax; ++i) { 228 ierr = ISDestroy(is[i]);CHKERRQ(ierr); 229 } 230 231 /* Do Local work*/ 232 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 233 234 /* Receive messages*/ 235 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr); 236 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 237 238 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 239 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 240 241 /* Phase 1 sends are complete - deallocate buffers */ 242 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 243 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 244 245 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr); 246 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr); 247 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 248 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 249 ierr = PetscFree(rbuf);CHKERRQ(ierr); 250 251 252 /* Send the data back*/ 253 /* Do a global reduction to know the buffer space req for incoming messages*/ 254 { 255 PetscMPIInt *rw1; 256 257 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr); 258 ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 259 260 for (i=0; i<nrqr; ++i) { 261 proc = recv_status[i].MPI_SOURCE; 262 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 263 rw1[proc] = isz1[i]; 264 } 265 ierr = PetscFree(onodes1);CHKERRQ(ierr); 266 ierr = PetscFree(olengths1);CHKERRQ(ierr); 267 268 /* Determine the number of messages to expect, their lengths, from from-ids */ 269 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 270 ierr = PetscFree(rw1);CHKERRQ(ierr); 271 } 272 /* Now post the Irecvs corresponding to these messages */ 273 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 274 275 /* Now post the 
sends */ 276 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 277 for (i=0; i<nrqr; ++i) { 278 j = recv_status[i].MPI_SOURCE; 279 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 280 } 281 282 /* receive work done on other processors*/ 283 { 284 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 285 PetscMPIInt idex; 286 PetscBT table_i; 287 MPI_Status *status2; 288 289 ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr); 290 for (i=0; i<nrqs; ++i) { 291 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 292 /* Process the message*/ 293 rbuf2_i = rbuf2[idex]; 294 ct1 = 2*rbuf2_i[0]+1; 295 jmax = rbuf2[idex][0]; 296 for (j=1; j<=jmax; j++) { 297 max = rbuf2_i[2*j]; 298 is_no = rbuf2_i[2*j-1]; 299 isz_i = isz[is_no]; 300 data_i = data[is_no]; 301 table_i = table[is_no]; 302 for (k=0; k<max; k++,ct1++) { 303 row = rbuf2_i[ct1]; 304 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 305 } 306 isz[is_no] = isz_i; 307 } 308 } 309 310 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 311 ierr = PetscFree(status2);CHKERRQ(ierr); 312 } 313 314 for (i=0; i<imax; ++i) { 315 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 316 } 317 318 ierr = PetscFree(onodes2);CHKERRQ(ierr); 319 ierr = PetscFree(olengths2);CHKERRQ(ierr); 320 321 ierr = PetscFree(pa);CHKERRQ(ierr); 322 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 323 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 324 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 325 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 326 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 327 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 328 ierr = PetscFree(table);CHKERRQ(ierr); 329 ierr = PetscFree(s_status);CHKERRQ(ierr); 330 ierr = PetscFree(recv_status);CHKERRQ(ierr); 331 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 332 ierr = PetscFree(xdata);CHKERRQ(ierr); 333 ierr = PetscFree(isz1);CHKERRQ(ierr); 
334 PetscFunctionReturn(0); 335 } 336 337 #undef __FUNCT__ 338 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 339 /* 340 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 341 the work on the local processor. 342 343 Inputs: 344 C - MAT_MPIAIJ; 345 imax - total no of index sets processed at a time; 346 table - an array of char - size = m bits. 347 348 Output: 349 isz - array containing the count of the solution elements correspondign 350 to each index set; 351 data - pointer to the solutions 352 */ 353 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 354 { 355 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 356 Mat A = c->A,B = c->B; 357 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 358 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 359 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 360 PetscBT table_i; 361 362 PetscFunctionBegin; 363 rstart = C->rmap->rstart; 364 cstart = C->cmap->rstart; 365 ai = a->i; 366 aj = a->j; 367 bi = b->i; 368 bj = b->j; 369 garray = c->garray; 370 371 372 for (i=0; i<imax; i++) { 373 data_i = data[i]; 374 table_i = table[i]; 375 isz_i = isz[i]; 376 for (j=0,max=isz[i]; j<max; j++) { 377 row = data_i[j] - rstart; 378 start = ai[row]; 379 end = ai[row+1]; 380 for (k=start; k<end; k++) { /* Amat */ 381 val = aj[k] + cstart; 382 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 383 } 384 start = bi[row]; 385 end = bi[row+1]; 386 for (k=start; k<end; k++) { /* Bmat */ 387 val = garray[bj[k]]; 388 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 389 } 390 } 391 isz[i] = isz_i; 392 } 393 PetscFunctionReturn(0); 394 } 395 396 #undef __FUNCT__ 397 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 398 /* 399 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 400 and return the output 401 402 Input: 403 C - the matrix 404 nrqr - no of messages being processed. 
405 rbuf - an array of pointers to the recieved requests 406 407 Output: 408 xdata - array of messages to be sent back 409 isz1 - size of each message 410 411 For better efficiency perhaps we should malloc separately each xdata[i], 412 then if a remalloc is required we need only copy the data for that one row 413 rather then all previous rows as it is now where a single large chunck of 414 memory is used. 415 416 */ 417 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 418 { 419 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 420 Mat A = c->A,B = c->B; 421 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 422 PetscErrorCode ierr; 423 PetscMPIInt rank; 424 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 425 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 426 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 427 PetscInt *rbuf_i,kmax,rbuf_0; 428 PetscBT xtable; 429 430 PetscFunctionBegin; 431 rank = c->rank; 432 m = C->rmap->N; 433 rstart = C->rmap->rstart; 434 cstart = C->cmap->rstart; 435 ai = a->i; 436 aj = a->j; 437 bi = b->i; 438 bj = b->j; 439 garray = c->garray; 440 441 442 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 443 rbuf_i = rbuf[i]; 444 rbuf_0 = rbuf_i[0]; 445 ct += rbuf_0; 446 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 447 } 448 449 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 450 else max1 = 1; 451 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 452 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 453 ++no_malloc; 454 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 455 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 456 457 ct3 = 0; 458 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 459 rbuf_i = rbuf[i]; 460 rbuf_0 = rbuf_i[0]; 461 ct1 = 2*rbuf_0+1; 462 ct2 = ct1; 463 ct3 += ct1; 464 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 465 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 466 oct2 = ct2; 467 kmax = rbuf_i[2*j]; 468 for (k=0; k<kmax; k++,ct1++) { 469 row = rbuf_i[ct1]; 470 if (!PetscBTLookupSet(xtable,row)) { 471 if (!(ct3 < mem_estimate)) { 472 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 473 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 474 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 475 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 476 xdata[0] = tmp; 477 mem_estimate = new_estimate; ++no_malloc; 478 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 479 } 480 xdata[i][ct2++] = row; 481 ct3++; 482 } 483 } 484 for (k=oct2,max2=ct2; k<max2; k++) { 485 row = xdata[i][k] - rstart; 486 start = ai[row]; 487 end = ai[row+1]; 488 for (l=start; l<end; l++) { 489 val = aj[l] + cstart; 490 if (!PetscBTLookupSet(xtable,val)) { 491 if (!(ct3 < mem_estimate)) { 492 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 493 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 494 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 495 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 496 xdata[0] = tmp; 497 mem_estimate = new_estimate; ++no_malloc; 498 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 499 } 500 xdata[i][ct2++] = val; 501 ct3++; 502 } 503 } 504 start = bi[row]; 505 end = bi[row+1]; 506 for (l=start; l<end; l++) { 507 val = garray[bj[l]]; 508 if (!PetscBTLookupSet(xtable,val)) { 509 if (!(ct3 < 
mem_estimate)) { 510 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 511 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 512 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 513 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 514 xdata[0] = tmp; 515 mem_estimate = new_estimate; ++no_malloc; 516 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 517 } 518 xdata[i][ct2++] = val; 519 ct3++; 520 } 521 } 522 } 523 /* Update the header*/ 524 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 525 xdata[i][2*j-1] = rbuf_i[2*j-1]; 526 } 527 xdata[i][0] = rbuf_0; 528 xdata[i+1] = xdata[i] + ct2; 529 isz1[i] = ct2; /* size of each message */ 530 } 531 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 532 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 533 PetscFunctionReturn(0); 534 } 535 /* -------------------------------------------------------------------------*/ 536 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 537 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 538 /* 539 Every processor gets the entire matrix 540 */ 541 #undef __FUNCT__ 542 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 543 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 544 { 545 Mat B; 546 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 547 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 548 PetscErrorCode ierr; 549 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 550 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 551 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 552 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 553 554 PetscFunctionBegin; 555 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 556 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 557 558 if (scall == MAT_INITIAL_MATRIX) { 559 /* ---------------------------------------------------------------- 560 Tell every processor the number of nonzeros per row 561 */ 562 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 563 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 564 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 565 } 566 sendcount = A->rmap->rend - A->rmap->rstart; 567 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 568 for (i=0; i<size; i++) { 569 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 570 displs[i] = A->rmap->range[i]; 571 } 572 #if defined(PETSC_HAVE_MPI_IN_PLACE) 573 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 574 #else 575 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 576 #endif 577 /* --------------------------------------------------------------- 578 Create the sequential matrix of the same type as the local block diagonal 579 */ 580 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 581 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 582 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 583 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 584 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 585 **Bin = B; 586 b = (Mat_SeqAIJ *)B->data; 587 588 /*-------------------------------------------------------------------- 589 Copy my part of matrix column indices over 590 */ 591 sendcount = ad->nz + bd->nz; 592 jsendbuf = b->j + b->i[rstarts[rank]]; 593 a_jsendbuf = ad->j; 594 b_jsendbuf = bd->j; 595 n = A->rmap->rend - A->rmap->rstart; 596 cnt = 0; 597 for (i=0; i<n; i++) { 598 599 /* put in lower 
diagonal portion */ 600 m = bd->i[i+1] - bd->i[i]; 601 while (m > 0) { 602 /* is it above diagonal (in bd (compressed) numbering) */ 603 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 604 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 605 m--; 606 } 607 608 /* put in diagonal portion */ 609 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 610 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 611 } 612 613 /* put in upper diagonal portion */ 614 while (m-- > 0) { 615 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 616 } 617 } 618 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 619 620 /*-------------------------------------------------------------------- 621 Gather all column indices to all processors 622 */ 623 for (i=0; i<size; i++) { 624 recvcounts[i] = 0; 625 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 626 recvcounts[i] += lens[j]; 627 } 628 } 629 displs[0] = 0; 630 for (i=1; i<size; i++) { 631 displs[i] = displs[i-1] + recvcounts[i-1]; 632 } 633 #if defined(PETSC_HAVE_MPI_IN_PLACE) 634 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 635 #else 636 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 637 #endif 638 /*-------------------------------------------------------------------- 639 Assemble the matrix into useable form (note numerical values not yet set) 640 */ 641 /* set the b->ilen (length of each row) values */ 642 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 643 /* set the b->i indices */ 644 b->i[0] = 0; 645 for (i=1; i<=A->rmap->N; i++) { 646 b->i[i] = b->i[i-1] + lens[i-1]; 647 } 648 ierr = PetscFree(lens);CHKERRQ(ierr); 649 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 650 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 651 652 } else { 653 B = **Bin; 654 b = (Mat_SeqAIJ 
*)B->data; 655 } 656 657 /*-------------------------------------------------------------------- 658 Copy my part of matrix numerical values into the values location 659 */ 660 if (flag == MAT_GET_VALUES){ 661 sendcount = ad->nz + bd->nz; 662 sendbuf = b->a + b->i[rstarts[rank]]; 663 a_sendbuf = ad->a; 664 b_sendbuf = bd->a; 665 b_sendj = bd->j; 666 n = A->rmap->rend - A->rmap->rstart; 667 cnt = 0; 668 for (i=0; i<n; i++) { 669 670 /* put in lower diagonal portion */ 671 m = bd->i[i+1] - bd->i[i]; 672 while (m > 0) { 673 /* is it above diagonal (in bd (compressed) numbering) */ 674 if (garray[*b_sendj] > A->rmap->rstart + i) break; 675 sendbuf[cnt++] = *b_sendbuf++; 676 m--; 677 b_sendj++; 678 } 679 680 /* put in diagonal portion */ 681 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 682 sendbuf[cnt++] = *a_sendbuf++; 683 } 684 685 /* put in upper diagonal portion */ 686 while (m-- > 0) { 687 sendbuf[cnt++] = *b_sendbuf++; 688 b_sendj++; 689 } 690 } 691 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 692 693 /* ----------------------------------------------------------------- 694 Gather all numerical values to all processors 695 */ 696 if (!recvcounts) { 697 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 698 } 699 for (i=0; i<size; i++) { 700 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 701 } 702 displs[0] = 0; 703 for (i=1; i<size; i++) { 704 displs[i] = displs[i-1] + recvcounts[i-1]; 705 } 706 recvbuf = b->a; 707 #if defined(PETSC_HAVE_MPI_IN_PLACE) 708 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 709 #else 710 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 711 #endif 712 } /* endof (flag == MAT_GET_VALUES) */ 713 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 714 715 if 
(A->symmetric){ 716 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 717 } else if (A->hermitian) { 718 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 719 } else if (A->structurally_symmetric) { 720 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 721 } 722 PetscFunctionReturn(0); 723 } 724 725 726 727 #undef __FUNCT__ 728 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 729 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 730 { 731 PetscErrorCode ierr; 732 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 733 PetscBool rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix; 734 735 PetscFunctionBegin; 736 /* 737 Check for special case each processor gets entire matrix 738 */ 739 if (ismax == 1 && C->rmap->N == C->cmap->N) { 740 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 741 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 742 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 743 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 744 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 745 wantallmatrix = PETSC_TRUE; 746 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 747 } 748 } 749 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 750 if (twantallmatrix) { 751 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 752 PetscFunctionReturn(0); 753 } 754 755 /* Allocate memory to hold all the submatrices */ 756 if (scall != MAT_REUSE_MATRIX) { 757 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 758 } 759 /* Determine the number of stages through which submatrices are done */ 760 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 761 /* 762 Each stage will extract nmax submatrices. 
763 nmax is determined by the matrix column dimension. 764 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 765 */ 766 if (!nmax) nmax = 1; 767 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 768 769 /* Make sure every processor loops through the nstages */ 770 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 771 772 for (i=0,pos=0; i<nstages; i++) { 773 if (pos+nmax <= ismax) max_no = nmax; 774 else if (pos == ismax) max_no = 0; 775 else max_no = ismax-pos; 776 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 777 pos += max_no; 778 } 779 PetscFunctionReturn(0); 780 } 781 782 783 784 785 /* -------------------------------------------------------------------------*/ 786 #undef __FUNCT__ 787 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 788 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 789 { 790 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 791 Mat A = c->A; 792 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 793 const PetscInt **icol,**irow; 794 PetscInt *nrow,*ncol,start; 795 PetscErrorCode ierr; 796 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 797 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 798 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 799 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap; 800 PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 801 const PetscInt *irow_i; 802 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 803 PetscInt *rmap_i; 804 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 805 MPI_Request *r_waits4,*s_waits3,*s_waits4; 806 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 807 MPI_Status *r_status3,*r_status4,*s_status4; 808 MPI_Comm comm; 809 PetscScalar 
**rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 810 PetscBool sorted; 811 PetscMPIInt *onodes1,*olengths1; 812 PetscMPIInt idex,idex2,end; 813 814 PetscFunctionBegin; 815 comm = ((PetscObject)C)->comm; 816 tag0 = ((PetscObject)C)->tag; 817 size = c->size; 818 rank = c->rank; 819 820 /* Get some new tags to keep the communication clean */ 821 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 822 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 823 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 824 825 /* Check if the col indices are sorted */ 826 for (i=0; i<ismax; i++) { 827 ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr); 828 /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/ 829 ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr); 830 /* if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */ 831 } 832 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 833 834 for (i=0; i<ismax; i++) { 835 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 836 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 837 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 838 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 839 } 840 841 /* evaluate communication - mesg to who, length of mesg, and buffer space 842 required. 
Based on this, buffers are allocated, and data copied into them*/ 843 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 844 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 845 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 846 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 847 for (i=0; i<ismax; i++) { 848 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 849 jmax = nrow[i]; 850 irow_i = irow[i]; 851 for (j=0; j<jmax; j++) { 852 l = 0; 853 row = irow_i[j]; 854 while (row >= C->rmap->range[l+1]) l++; 855 proc = l; 856 w4[proc]++; 857 } 858 for (j=0; j<size; j++) { 859 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 860 } 861 } 862 863 nrqs = 0; /* no of outgoing messages */ 864 msz = 0; /* total mesg length (for all procs) */ 865 w1[rank] = 0; /* no mesg sent to self */ 866 w3[rank] = 0; 867 for (i=0; i<size; i++) { 868 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 869 } 870 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 871 for (i=0,j=0; i<size; i++) { 872 if (w1[i]) { pa[j] = i; j++; } 873 } 874 875 /* Each message would have a header = 1 + 2*(no of IS) + data */ 876 for (i=0; i<nrqs; i++) { 877 j = pa[i]; 878 w1[j] += w2[j] + 2* w3[j]; 879 msz += w1[j]; 880 } 881 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 882 883 /* Determine the number of messages to expect, their lengths, from from-ids */ 884 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 885 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 886 887 /* Now post the Irecvs corresponding to these messages */ 888 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 889 890 ierr = PetscFree(onodes1);CHKERRQ(ierr); 891 ierr = PetscFree(olengths1);CHKERRQ(ierr); 892 893 /* Allocate Memory for outgoing messages */ 894 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 895 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 896 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 897 898 { 899 PetscInt *iptr = tmp,ict = 0; 900 for (i=0; i<nrqs; i++) { 901 j = pa[i]; 902 iptr += ict; 903 sbuf1[j] = iptr; 904 ict = w1[j]; 905 } 906 } 907 908 /* Form the outgoing messages */ 909 /* Initialize the header space */ 910 for (i=0; i<nrqs; i++) { 911 j = pa[i]; 912 sbuf1[j][0] = 0; 913 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 914 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 915 } 916 917 /* Parse the isrow and copy data into outbuf */ 918 for (i=0; i<ismax; i++) { 919 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 920 irow_i = irow[i]; 921 jmax = nrow[i]; 922 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 923 l = 0; 924 row = irow_i[j]; 925 while (row >= C->rmap->range[l+1]) l++; 926 proc = l; 927 if (proc != rank) { /* copy to the outgoing buf*/ 928 ctr[proc]++; 929 *ptr[proc] = row; 930 ptr[proc]++; 931 } 932 } 933 /* Update the headers for the current IS */ 934 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 935 if ((ctr_j = ctr[j])) { 936 sbuf1_j = sbuf1[j]; 937 k = ++sbuf1_j[0]; 938 sbuf1_j[2*k] = ctr_j; 939 sbuf1_j[2*k-1] = i; 940 } 941 } 942 } 943 944 /* Now post the sends */ 945 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 946 for (i=0; i<nrqs; ++i) { 947 j = pa[i]; 948 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 949 } 950 951 /* Post Receives to capture the buffer size */ 952 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 953 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 954 rbuf2[0] = tmp + msz; 955 for (i=1; i<nrqs; ++i) { 956 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 957 } 958 for (i=0; i<nrqs; ++i) { 959 j = pa[i]; 960 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 961 } 962 963 /* Send to other procs the buf size they should allocate */ 964 965 966 /* Receive messages*/ 967 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 968 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 969 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 970 { 971 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 972 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 973 PetscInt *sbuf2_i; 974 975 for (i=0; i<nrqr; ++i) { 976 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 977 req_size[idex] = 0; 978 rbuf1_i = rbuf1[idex]; 979 start = 2*rbuf1_i[0] + 1; 980 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 981 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 982 sbuf2_i = sbuf2[idex]; 983 for (j=start; j<end; j++) { 984 id = rbuf1_i[j] - rstart; 985 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 986 sbuf2_i[j] = ncols; 987 req_size[idex] += ncols; 988 } 989 req_source[idex] = r_status1[i].MPI_SOURCE; 990 /* form the header */ 991 sbuf2_i[0] = req_size[idex]; 992 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 993 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 994 } 995 } 996 ierr = PetscFree(r_status1);CHKERRQ(ierr); 997 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 998 999 /* recv buffer sizes */ 1000 /* Receive messages*/ 1001 1002 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1003 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1004 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1005 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1006 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1007 1008 for (i=0; i<nrqs; ++i) { 1009 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1010 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1011 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1012 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1013 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1014 } 1015 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1016 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1017 1018 /* Wait on sends1 and sends2 */ 1019 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1020 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1021 1022 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1023 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1024 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1025 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1026 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1027 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1028 1029 /* Now allocate buffers for a->j, and send them off */ 1030 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1031 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1032 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1033 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1034 1035 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1036 { 1037 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1038 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1039 PetscInt cend = C->cmap->rend; 1040 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1041 1042 for (i=0; i<nrqr; i++) { 1043 rbuf1_i = rbuf1[i]; 1044 sbuf_aj_i = sbuf_aj[i]; 1045 ct1 = 2*rbuf1_i[0] + 1; 1046 ct2 = 0; 1047 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1048 kmax = rbuf1[i][2*j]; 1049 for (k=0; k<kmax; k++,ct1++) { 1050 row = rbuf1_i[ct1] - rstart; 1051 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1052 ncols = nzA + nzB; 1053 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1054 1055 /* load the column indices for this row into cols*/ 1056 cols = sbuf_aj_i + ct2; 1057 1058 lwrite = 0; 1059 for (l=0; l<nzB; l++) { 1060 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1061 } 1062 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1063 for (l=0; l<nzB; l++) { 1064 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1065 } 1066 1067 ct2 += ncols; 1068 } 1069 } 1070 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1071 } 1072 } 1073 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1074 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1075 1076 /* Allocate buffers for a->a, and send them off */ 1077 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1078 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1079 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1080 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1081 1082 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1083 { 1084 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1085 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1086 PetscInt cend = C->cmap->rend; 1087 PetscInt *b_j = b->j; 1088 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1089 1090 for (i=0; 
i<nrqr; i++) { 1091 rbuf1_i = rbuf1[i]; 1092 sbuf_aa_i = sbuf_aa[i]; 1093 ct1 = 2*rbuf1_i[0]+1; 1094 ct2 = 0; 1095 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1096 kmax = rbuf1_i[2*j]; 1097 for (k=0; k<kmax; k++,ct1++) { 1098 row = rbuf1_i[ct1] - rstart; 1099 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1100 ncols = nzA + nzB; 1101 cworkB = b_j + b_i[row]; 1102 vworkA = a_a + a_i[row]; 1103 vworkB = b_a + b_i[row]; 1104 1105 /* load the column values for this row into vals*/ 1106 vals = sbuf_aa_i+ct2; 1107 1108 lwrite = 0; 1109 for (l=0; l<nzB; l++) { 1110 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1111 } 1112 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1113 for (l=0; l<nzB; l++) { 1114 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1115 } 1116 1117 ct2 += ncols; 1118 } 1119 } 1120 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1121 } 1122 } 1123 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1124 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1125 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1126 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1127 1128 /* Form the matrix */ 1129 /* create col map */ 1130 { 1131 const PetscInt *icol_i; 1132 1133 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1134 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1135 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1136 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1137 for (i=0; i<ismax; i++) { 1138 jmax = ncol[i]; 1139 icol_i = icol[i]; 1140 cmap_i = cmap[i]; 1141 for (j=0; j<jmax; j++) { 1142 cmap_i[icol_i[j]] = j+1; 1143 } 1144 } 1145 } 1146 1147 /* Create lens which is required for MatCreate... 
*/ 1148 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1149 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1150 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1151 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1152 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1153 1154 /* Update lens from local data */ 1155 for (i=0; i<ismax; i++) { 1156 jmax = nrow[i]; 1157 cmap_i = cmap[i]; 1158 irow_i = irow[i]; 1159 lens_i = lens[i]; 1160 for (j=0; j<jmax; j++) { 1161 l = 0; 1162 row = irow_i[j]; 1163 while (row >= C->rmap->range[l+1]) l++; 1164 proc = l; 1165 if (proc == rank) { 1166 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1167 for (k=0; k<ncols; k++) { 1168 if (cmap_i[cols[k]]) { lens_i[j]++;} 1169 } 1170 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1171 } 1172 } 1173 } 1174 1175 /* Create row map*/ 1176 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1177 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1178 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1179 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1180 for (i=0; i<ismax; i++) { 1181 rmap_i = rmap[i]; 1182 irow_i = irow[i]; 1183 jmax = nrow[i]; 1184 for (j=0; j<jmax; j++) { 1185 rmap_i[irow_i[j]] = j; 1186 } 1187 } 1188 1189 /* Update lens from offproc data */ 1190 { 1191 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1192 1193 for (tmp2=0; tmp2<nrqs; tmp2++) { 1194 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1195 idex = pa[idex2]; 1196 sbuf1_i = sbuf1[idex]; 1197 jmax = sbuf1_i[0]; 1198 ct1 = 2*jmax+1; 1199 ct2 = 0; 1200 rbuf2_i = rbuf2[idex2]; 1201 rbuf3_i = rbuf3[idex2]; 1202 for (j=1; j<=jmax; j++) { 1203 is_no = sbuf1_i[2*j-1]; 1204 max1 = sbuf1_i[2*j]; 1205 lens_i = lens[is_no]; 1206 cmap_i = cmap[is_no]; 1207 rmap_i = rmap[is_no]; 1208 for (k=0; k<max1; k++,ct1++) { 1209 row = rmap_i[sbuf1_i[ct1]]; 
/* the val in the new matrix to be */ 1210 max2 = rbuf2_i[ct1]; 1211 for (l=0; l<max2; l++,ct2++) { 1212 if (cmap_i[rbuf3_i[ct2]]) { 1213 lens_i[row]++; 1214 } 1215 } 1216 } 1217 } 1218 } 1219 } 1220 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1221 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1222 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1223 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1224 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1225 1226 /* Create the submatrices */ 1227 if (scall == MAT_REUSE_MATRIX) { 1228 PetscBool flag; 1229 1230 /* 1231 Assumes new rows are same length as the old rows,hence bug! 1232 */ 1233 for (i=0; i<ismax; i++) { 1234 mat = (Mat_SeqAIJ *)(submats[i]->data); 1235 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1236 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1237 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong no of nonzeros"); 1238 /* Initial matrix as if empty */ 1239 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1240 submats[i]->factortype = C->factortype; 1241 } 1242 } else { 1243 for (i=0; i<ismax; i++) { 1244 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1245 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1246 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1247 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1248 } 1249 } 1250 1251 /* Assemble the matrices */ 1252 /* First assemble the local rows */ 1253 { 1254 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1255 PetscScalar *imat_a; 1256 1257 for (i=0; i<ismax; i++) { 1258 mat = (Mat_SeqAIJ*)submats[i]->data; 1259 imat_ilen = mat->ilen; 1260 imat_j = mat->j; 1261 imat_i = mat->i; 1262 imat_a = mat->a; 1263 cmap_i = cmap[i]; 1264 rmap_i = rmap[i]; 1265 irow_i = irow[i]; 1266 jmax = nrow[i]; 1267 for (j=0; j<jmax; j++) { 1268 l = 0; 1269 row = irow_i[j]; 1270 while (row >= C->rmap->range[l+1]) l++; 1271 proc = l; 1272 if (proc == rank) { 1273 old_row = row; 1274 row = rmap_i[row]; 1275 ilen_row = imat_ilen[row]; 1276 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1277 mat_i = imat_i[row] ; 1278 mat_a = imat_a + mat_i; 1279 mat_j = imat_j + mat_i; 1280 for (k=0; k<ncols; k++) { 1281 if ((tcol = cmap_i[cols[k]])) { 1282 *mat_j++ = tcol - 1; 1283 *mat_a++ = vals[k]; 1284 ilen_row++; 1285 } 1286 } 1287 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1288 imat_ilen[row] = ilen_row; 1289 } 1290 } 1291 } 1292 } 1293 1294 /* Now assemble the off proc rows*/ 1295 { 1296 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1297 PetscInt *imat_j,*imat_i; 1298 PetscScalar *imat_a,*rbuf4_i; 1299 1300 for (tmp2=0; tmp2<nrqs; tmp2++) { 1301 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1302 idex = 
pa[idex2]; 1303 sbuf1_i = sbuf1[idex]; 1304 jmax = sbuf1_i[0]; 1305 ct1 = 2*jmax + 1; 1306 ct2 = 0; 1307 rbuf2_i = rbuf2[idex2]; 1308 rbuf3_i = rbuf3[idex2]; 1309 rbuf4_i = rbuf4[idex2]; 1310 for (j=1; j<=jmax; j++) { 1311 is_no = sbuf1_i[2*j-1]; 1312 rmap_i = rmap[is_no]; 1313 cmap_i = cmap[is_no]; 1314 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1315 imat_ilen = mat->ilen; 1316 imat_j = mat->j; 1317 imat_i = mat->i; 1318 imat_a = mat->a; 1319 max1 = sbuf1_i[2*j]; 1320 for (k=0; k<max1; k++,ct1++) { 1321 row = sbuf1_i[ct1]; 1322 row = rmap_i[row]; 1323 ilen = imat_ilen[row]; 1324 mat_i = imat_i[row] ; 1325 mat_a = imat_a + mat_i; 1326 mat_j = imat_j + mat_i; 1327 max2 = rbuf2_i[ct1]; 1328 for (l=0; l<max2; l++,ct2++) { 1329 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1330 *mat_j++ = tcol - 1; 1331 *mat_a++ = rbuf4_i[ct2]; 1332 ilen++; 1333 } 1334 } 1335 imat_ilen[row] = ilen; 1336 } 1337 } 1338 } 1339 } 1340 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1341 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1342 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1343 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1344 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1345 1346 /* Restore the indices */ 1347 for (i=0; i<ismax; i++) { 1348 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1349 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1350 } 1351 1352 /* Destroy allocated memory */ 1353 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1354 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1355 ierr = PetscFree(pa);CHKERRQ(ierr); 1356 1357 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1358 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1359 for (i=0; i<nrqr; ++i) { 1360 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1361 } 1362 for (i=0; i<nrqs; ++i) { 1363 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1364 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1365 } 1366 1367 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1368 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  /* sbuf_aj/sbuf_aa were allocated as a pointer array plus one contiguous data slab,
     so only the first entry and the array itself are freed */
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  /* cmap, rmap, and lens use the same pointer-array-plus-slab layout */
  ierr = PetscFree(cmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Finalize every extracted sequential submatrix */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Builds an MPIAIJ matrix on comm from its local diagonal block A and
   off-diagonal block B (both expected to be SeqAIJ).

   Observe that the Seq matrices used to construct this MPI matrix are not increfed.
   Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  }
  ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  }
  /* A and B must have the same local row count and matching block sizes */
  if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatCreate(comm, C); CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* after layout setup the global column count must equal diag + off-diag local widths */
  if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
  /* install A and B directly as the diag/off-diag parts; note: no reference is taken */
  aij = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

/*
   Returns (without increfing) the diagonal (A) and off-diagonal (B)
   sequential blocks of the MPIAIJ matrix C.
*/
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}


/*
   Extracts submatrices of C for the index-set pairs isrow[i]/iscol[i]; submatrices
   whose index sets live on a communicator of size > 1 come out parallel, the rest
   sequential.  The type-specific work is delegated to three callbacks:
     getsubmats_seq - extracts sequential (per-process) submatrices,
     makefromseq    - builds a parallel matrix from its seq diag/off-diag parts,
     extractseq     - recovers the seq diag/off-diag parts of an existing parallel matrix.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;   /* number of parallel (comm size > 1) index sets */
  PetscFunctionBegin;
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate the IS pairs and count how many are parallel */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else { /* if(ismax_c) */
    Mat         *A,*B;
    IS          *isrow_c, *iscol_c;
    PetscMPIInt size;   /* intentionally shadows the outer 'size' for this branch */
    /*
       Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
       array of sequential matrices underlying the resulting parallel matrices.
       Which arrays to allocate is based on the value of MatReuse scall.
       There are as many diag matrices as there are original index sets.
       There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

       Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
       we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
       getsubmats_seq will deposit the extracted diag and off-diag parts.
       However, if reuse is taking place, we have to allocate the seq matrix arrays here.
       If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        }
        else {
          /* sequential submatrix: it is its own "diag" part */
          A[i] = (*submat)[i];
        }
      }
    }
    /*
       Construct the complements of the iscol ISs for parallel ISs only.
       These are used to extract the off-diag portion of the resulting parallel matrix.
       The row IS for the off-diag portion is the same as for the diag portion,
       so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
        isrow_c[ii] = isrow[i];
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
       If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
       have been extracted directly into the parallel matrices containing them, or
       simply into the sequential matrix identical with the corresponding A (if size == 1).
       Otherwise, make sure that parallel matrices are constructed from A & B, or the
       A is put into the correct submat slot (if size == 1).
    */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
             For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Free only the holder arrays; the seq matrices themselves are now owned by (or are) the submatrices. */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



/*
   MPIAIJ front end: binds the generic parallel-submatrix driver to the
   MPIAIJ-specific extraction/assembly callbacks.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
                                           MatCreateMPIAIJFromSeqMatrices_Private,
                                           MatMPIAIJExtractSeqMatrices_Private);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}