#define PETSCMAT_DLL

/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include "../src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Widens each of the imax index sets in is[] by
   ov levels of overlap.  Each level is one application of the single-level
   routine MatIncreaseOverlap_MPIAIJ_Once(), which replaces is[] in place.

   Collective on C (every rank must call with the same ov).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  /* one pass per requested overlap level; ov == 0 is a no-op */
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one level of overlap increase.
   Rows owned remotely are shipped to their owner (tag1), each owner expands
   them against its local matrix, and the expanded sets come back (tag2).
   The input index sets are destroyed and replaced by the widened ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  m    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = C->rmap->range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      proc = rtable[row];
      w4[proc]++;            /* w4[p]: entries of this IS owned by rank p */
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}   /* w1: data count, w3: IS count per dest */
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* carve the single tmp buffer into one slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;   /* data region starts after the header */
  }

  /* Memory for doing local proc's work: one malloc carved into the per-IS
     bit tables (table), result arrays (data) and result counts (isz) */
  {
    PetscInt *d_p;
    char     *t_p;

    /* should replace with PetscMallocN() */
    ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                       (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                        (m)*imax*sizeof(PetscInt) + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data = (PetscInt **)(table + imax);
    isz  = (PetscInt *)(data + imax);
    d_p  = (PetscInt *)(isz + imax);
    t_p  = (char *)(d_p + m*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (m)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  /* the input sets are replaced at the end, so destroy them now */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: merge the returned rows into each IS's table */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* replace the (destroyed) input sets with the widened ones */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
357 358 Output: 359 isz - array containing the count of the solution elements correspondign 360 to each index set; 361 data - pointer to the solutions 362 */ 363 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 364 { 365 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 366 Mat A = c->A,B = c->B; 367 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 368 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 369 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 370 PetscBT table_i; 371 372 PetscFunctionBegin; 373 rstart = C->rmap->rstart; 374 cstart = C->cmap->rstart; 375 ai = a->i; 376 aj = a->j; 377 bi = b->i; 378 bj = b->j; 379 garray = c->garray; 380 381 382 for (i=0; i<imax; i++) { 383 data_i = data[i]; 384 table_i = table[i]; 385 isz_i = isz[i]; 386 for (j=0,max=isz[i]; j<max; j++) { 387 row = data_i[j] - rstart; 388 start = ai[row]; 389 end = ai[row+1]; 390 for (k=start; k<end; k++) { /* Amat */ 391 val = aj[k] + cstart; 392 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 393 } 394 start = bi[row]; 395 end = bi[row+1]; 396 for (k=start; k<end; k++) { /* Bmat */ 397 val = garray[bj[k]]; 398 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 399 } 400 } 401 isz[i] = isz_i; 402 } 403 PetscFunctionReturn(0); 404 } 405 406 #undef __FUNCT__ 407 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 408 /* 409 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 410 and return the output 411 412 Input: 413 C - the matrix 414 nrqr - no of messages being processed. 
415 rbuf - an array of pointers to the recieved requests 416 417 Output: 418 xdata - array of messages to be sent back 419 isz1 - size of each message 420 421 For better efficiency perhaps we should malloc separately each xdata[i], 422 then if a remalloc is required we need only copy the data for that one row 423 rather then all previous rows as it is now where a single large chunck of 424 memory is used. 425 426 */ 427 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 428 { 429 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 430 Mat A = c->A,B = c->B; 431 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 432 PetscErrorCode ierr; 433 PetscMPIInt rank; 434 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 435 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 436 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 437 PetscInt *rbuf_i,kmax,rbuf_0; 438 PetscBT xtable; 439 440 PetscFunctionBegin; 441 rank = c->rank; 442 m = C->rmap->N; 443 rstart = C->rmap->rstart; 444 cstart = C->cmap->rstart; 445 ai = a->i; 446 aj = a->j; 447 bi = b->i; 448 bj = b->j; 449 garray = c->garray; 450 451 452 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 453 rbuf_i = rbuf[i]; 454 rbuf_0 = rbuf_i[0]; 455 ct += rbuf_0; 456 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 457 } 458 459 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 460 else max1 = 1; 461 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 462 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 463 ++no_malloc; 464 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 465 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 466 467 ct3 = 0; 468 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 469 rbuf_i = rbuf[i]; 470 rbuf_0 = rbuf_i[0]; 471 ct1 = 2*rbuf_0+1; 472 ct2 = ct1; 473 ct3 += ct1; 474 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 475 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 476 oct2 = ct2; 477 kmax = rbuf_i[2*j]; 478 for (k=0; k<kmax; k++,ct1++) { 479 row = rbuf_i[ct1]; 480 if (!PetscBTLookupSet(xtable,row)) { 481 if (!(ct3 < mem_estimate)) { 482 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 483 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 484 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 485 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 486 xdata[0] = tmp; 487 mem_estimate = new_estimate; ++no_malloc; 488 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 489 } 490 xdata[i][ct2++] = row; 491 ct3++; 492 } 493 } 494 for (k=oct2,max2=ct2; k<max2; k++) { 495 row = xdata[i][k] - rstart; 496 start = ai[row]; 497 end = ai[row+1]; 498 for (l=start; l<end; l++) { 499 val = aj[l] + cstart; 500 if (!PetscBTLookupSet(xtable,val)) { 501 if (!(ct3 < mem_estimate)) { 502 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 503 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 504 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 505 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 506 xdata[0] = tmp; 507 mem_estimate = new_estimate; ++no_malloc; 508 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 509 } 510 xdata[i][ct2++] = val; 511 ct3++; 512 } 513 } 514 start = bi[row]; 515 end = bi[row+1]; 516 for (l=start; l<end; l++) { 517 val = garray[bj[l]]; 518 if (!PetscBTLookupSet(xtable,val)) { 519 if (!(ct3 < 
mem_estimate)) { 520 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 521 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 522 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 523 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 524 xdata[0] = tmp; 525 mem_estimate = new_estimate; ++no_malloc; 526 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 527 } 528 xdata[i][ct2++] = val; 529 ct3++; 530 } 531 } 532 } 533 /* Update the header*/ 534 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 535 xdata[i][2*j-1] = rbuf_i[2*j-1]; 536 } 537 xdata[i][0] = rbuf_0; 538 xdata[i+1] = xdata[i] + ct2; 539 isz1[i] = ct2; /* size of each message */ 540 } 541 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 542 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 543 PetscFunctionReturn(0); 544 } 545 /* -------------------------------------------------------------------------*/ 546 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 547 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 548 /* 549 Every processor gets the entire matrix 550 */ 551 #undef __FUNCT__ 552 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 553 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 554 { 555 Mat B; 556 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 557 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 558 PetscErrorCode ierr; 559 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 560 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 561 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 562 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 563 564 PetscFunctionBegin; 565 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 566 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    /* each rank already holds its own segment of lens in place */
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into the
       segment of B's j array that corresponds to my row range.  The local
       row is emitted in global column order: off-diagonal entries below the
       diagonal block, then the diagonal block, then the rest.
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location;
       mirrors the column-index pass above, so values land in the same order
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if
 (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices, in
   stages to bound memory use; detects the "every rank wants the whole
   matrix" special case and routes it to MatGetSubMatrix_MPIAIJ_All().
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* MIN over ranks: take the fast path only if every rank wants it.
     NOTE(review): a PetscBool is reduced with MPI_INT — assumes
     sizeof(PetscBool)==sizeof(int); confirm against the PetscBool typedef */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }
  /* Determine the number of stages through which submatrices are done.
     NOTE(review): divides by C->cmap->N — a zero-column matrix would divide
     by zero here; confirm callers exclude that case */
  nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;   /* this rank is done; call with 0 to stay collective */
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
  PetscInt       **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  PetscInt       *rmap_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status     *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status     *r_status3,*r_status4,*s_status4;
  MPI_Comm       comm;
  PetscScalar
**rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 820 PetscBool sorted; 821 PetscMPIInt *onodes1,*olengths1; 822 PetscMPIInt idex,idex2,end; 823 824 PetscFunctionBegin; 825 comm = ((PetscObject)C)->comm; 826 tag0 = ((PetscObject)C)->tag; 827 size = c->size; 828 rank = c->rank; 829 830 /* Get some new tags to keep the communication clean */ 831 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 832 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 833 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 834 835 /* Check if the col indices are sorted */ 836 for (i=0; i<ismax; i++) { 837 ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr); 838 /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/ 839 ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr); 840 /* if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */ 841 } 842 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 843 844 for (i=0; i<ismax; i++) { 845 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 846 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 847 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 848 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 849 } 850 851 /* evaluate communication - mesg to who, length of mesg, and buffer space 852 required. 
Based on this, buffers are allocated, and data copied into them*/ 853 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 854 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 855 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 856 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 857 for (i=0; i<ismax; i++) { 858 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 859 jmax = nrow[i]; 860 irow_i = irow[i]; 861 for (j=0; j<jmax; j++) { 862 l = 0; 863 row = irow_i[j]; 864 while (row >= C->rmap->range[l+1]) l++; 865 proc = l; 866 w4[proc]++; 867 } 868 for (j=0; j<size; j++) { 869 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 870 } 871 } 872 873 nrqs = 0; /* no of outgoing messages */ 874 msz = 0; /* total mesg length (for all procs) */ 875 w1[rank] = 0; /* no mesg sent to self */ 876 w3[rank] = 0; 877 for (i=0; i<size; i++) { 878 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 879 } 880 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 881 for (i=0,j=0; i<size; i++) { 882 if (w1[i]) { pa[j] = i; j++; } 883 } 884 885 /* Each message would have a header = 1 + 2*(no of IS) + data */ 886 for (i=0; i<nrqs; i++) { 887 j = pa[i]; 888 w1[j] += w2[j] + 2* w3[j]; 889 msz += w1[j]; 890 } 891 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 892 893 /* Determine the number of messages to expect, their lengths, from from-ids */ 894 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 895 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 896 897 /* Now post the Irecvs corresponding to these messages */ 898 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 899 900 ierr = PetscFree(onodes1);CHKERRQ(ierr); 901 ierr = PetscFree(olengths1);CHKERRQ(ierr); 902 903 /* Allocate Memory for outgoing messages */ 904 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 905 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 906 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 907 908 { 909 PetscInt *iptr = tmp,ict = 0; 910 for (i=0; i<nrqs; i++) { 911 j = pa[i]; 912 iptr += ict; 913 sbuf1[j] = iptr; 914 ict = w1[j]; 915 } 916 } 917 918 /* Form the outgoing messages */ 919 /* Initialize the header space */ 920 for (i=0; i<nrqs; i++) { 921 j = pa[i]; 922 sbuf1[j][0] = 0; 923 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 924 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 925 } 926 927 /* Parse the isrow and copy data into outbuf */ 928 for (i=0; i<ismax; i++) { 929 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 930 irow_i = irow[i]; 931 jmax = nrow[i]; 932 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 933 l = 0; 934 row = irow_i[j]; 935 while (row >= C->rmap->range[l+1]) l++; 936 proc = l; 937 if (proc != rank) { /* copy to the outgoing buf*/ 938 ctr[proc]++; 939 *ptr[proc] = row; 940 ptr[proc]++; 941 } 942 } 943 /* Update the headers for the current IS */ 944 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 945 if ((ctr_j = ctr[j])) { 946 sbuf1_j = sbuf1[j]; 947 k = ++sbuf1_j[0]; 948 sbuf1_j[2*k] = ctr_j; 949 sbuf1_j[2*k-1] = i; 950 } 951 } 952 } 953 954 /* Now post the sends */ 955 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 956 for (i=0; i<nrqs; ++i) { 957 j = pa[i]; 958 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 959 } 960 961 /* Post Receives to capture the buffer size */ 962 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 963 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 964 rbuf2[0] = tmp + msz; 965 for (i=1; i<nrqs; ++i) { 966 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 967 } 968 for (i=0; i<nrqs; ++i) { 969 j = pa[i]; 970 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 971 } 972 973 /* Send to other procs the buf size they should allocate */ 974 975 976 /* Receive messages*/ 977 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 978 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 979 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 980 { 981 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 982 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 983 PetscInt *sbuf2_i; 984 985 for (i=0; i<nrqr; ++i) { 986 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 987 req_size[idex] = 0; 988 rbuf1_i = rbuf1[idex]; 989 start = 2*rbuf1_i[0] + 1; 990 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 991 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 992 sbuf2_i = sbuf2[idex]; 993 for (j=start; j<end; j++) { 994 id = rbuf1_i[j] - rstart; 995 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 996 sbuf2_i[j] = ncols; 997 req_size[idex] += ncols; 998 } 999 req_source[idex] = r_status1[i].MPI_SOURCE; 1000 /* form the header */ 1001 sbuf2_i[0] = req_size[idex]; 1002 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1003 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1004 } 1005 } 1006 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1007 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1008 1009 /* recv buffer sizes */ 1010 /* Receive messages*/ 1011 1012 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1013 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1014 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1015 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1016 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1017 1018 for (i=0; i<nrqs; ++i) { 1019 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1020 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1021 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1022 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1023 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1024 } 1025 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1026 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1027 1028 /* Wait on sends1 and sends2 */ 1029 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1030 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1031 1032 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1033 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1034 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1035 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1036 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1037 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1038 1039 /* Now allocate buffers for a->j, and send them off */ 1040 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1041 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1042 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1043 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1044 1045 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1046 { 1047 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1048 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1049 PetscInt cend = C->cmap->rend; 1050 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1051 1052 for (i=0; i<nrqr; i++) { 1053 rbuf1_i = rbuf1[i]; 1054 sbuf_aj_i = sbuf_aj[i]; 1055 ct1 = 2*rbuf1_i[0] + 1; 1056 ct2 = 0; 1057 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1058 kmax = rbuf1[i][2*j]; 1059 for (k=0; k<kmax; k++,ct1++) { 1060 row = rbuf1_i[ct1] - rstart; 1061 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1062 ncols = nzA + nzB; 1063 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1064 1065 /* load the column indices for this row into cols*/ 1066 cols = sbuf_aj_i + ct2; 1067 1068 lwrite = 0; 1069 for (l=0; l<nzB; l++) { 1070 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1071 } 1072 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1073 for (l=0; l<nzB; l++) { 1074 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1075 } 1076 1077 ct2 += ncols; 1078 } 1079 } 1080 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1081 } 1082 } 1083 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1084 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1085 1086 /* Allocate buffers for a->a, and send them off */ 1087 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1088 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1089 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1090 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1091 1092 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1093 { 1094 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1095 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1096 PetscInt cend = C->cmap->rend; 1097 PetscInt *b_j = b->j; 1098 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1099 1100 for (i=0; 
i<nrqr; i++) { 1101 rbuf1_i = rbuf1[i]; 1102 sbuf_aa_i = sbuf_aa[i]; 1103 ct1 = 2*rbuf1_i[0]+1; 1104 ct2 = 0; 1105 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1106 kmax = rbuf1_i[2*j]; 1107 for (k=0; k<kmax; k++,ct1++) { 1108 row = rbuf1_i[ct1] - rstart; 1109 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1110 ncols = nzA + nzB; 1111 cworkB = b_j + b_i[row]; 1112 vworkA = a_a + a_i[row]; 1113 vworkB = b_a + b_i[row]; 1114 1115 /* load the column values for this row into vals*/ 1116 vals = sbuf_aa_i+ct2; 1117 1118 lwrite = 0; 1119 for (l=0; l<nzB; l++) { 1120 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1121 } 1122 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1123 for (l=0; l<nzB; l++) { 1124 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1125 } 1126 1127 ct2 += ncols; 1128 } 1129 } 1130 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1131 } 1132 } 1133 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1134 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1135 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1136 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1137 1138 /* Form the matrix */ 1139 /* create col map */ 1140 { 1141 const PetscInt *icol_i; 1142 1143 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1144 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1145 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1146 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1147 for (i=0; i<ismax; i++) { 1148 jmax = ncol[i]; 1149 icol_i = icol[i]; 1150 cmap_i = cmap[i]; 1151 for (j=0; j<jmax; j++) { 1152 cmap_i[icol_i[j]] = j+1; 1153 } 1154 } 1155 } 1156 1157 /* Create lens which is required for MatCreate... 
*/ 1158 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1159 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1160 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1161 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1162 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1163 1164 /* Update lens from local data */ 1165 for (i=0; i<ismax; i++) { 1166 jmax = nrow[i]; 1167 cmap_i = cmap[i]; 1168 irow_i = irow[i]; 1169 lens_i = lens[i]; 1170 for (j=0; j<jmax; j++) { 1171 l = 0; 1172 row = irow_i[j]; 1173 while (row >= C->rmap->range[l+1]) l++; 1174 proc = l; 1175 if (proc == rank) { 1176 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1177 for (k=0; k<ncols; k++) { 1178 if (cmap_i[cols[k]]) { lens_i[j]++;} 1179 } 1180 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1181 } 1182 } 1183 } 1184 1185 /* Create row map*/ 1186 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1187 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1188 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1189 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1190 for (i=0; i<ismax; i++) { 1191 rmap_i = rmap[i]; 1192 irow_i = irow[i]; 1193 jmax = nrow[i]; 1194 for (j=0; j<jmax; j++) { 1195 rmap_i[irow_i[j]] = j; 1196 } 1197 } 1198 1199 /* Update lens from offproc data */ 1200 { 1201 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1202 1203 for (tmp2=0; tmp2<nrqs; tmp2++) { 1204 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1205 idex = pa[idex2]; 1206 sbuf1_i = sbuf1[idex]; 1207 jmax = sbuf1_i[0]; 1208 ct1 = 2*jmax+1; 1209 ct2 = 0; 1210 rbuf2_i = rbuf2[idex2]; 1211 rbuf3_i = rbuf3[idex2]; 1212 for (j=1; j<=jmax; j++) { 1213 is_no = sbuf1_i[2*j-1]; 1214 max1 = sbuf1_i[2*j]; 1215 lens_i = lens[is_no]; 1216 cmap_i = cmap[is_no]; 1217 rmap_i = rmap[is_no]; 1218 for (k=0; k<max1; k++,ct1++) { 1219 row = rmap_i[sbuf1_i[ct1]]; 
/* the val in the new matrix to be */ 1220 max2 = rbuf2_i[ct1]; 1221 for (l=0; l<max2; l++,ct2++) { 1222 if (cmap_i[rbuf3_i[ct2]]) { 1223 lens_i[row]++; 1224 } 1225 } 1226 } 1227 } 1228 } 1229 } 1230 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1231 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1232 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1233 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1234 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1235 1236 /* Create the submatrices */ 1237 if (scall == MAT_REUSE_MATRIX) { 1238 PetscBool flag; 1239 1240 /* 1241 Assumes new rows are same length as the old rows,hence bug! 1242 */ 1243 for (i=0; i<ismax; i++) { 1244 mat = (Mat_SeqAIJ *)(submats[i]->data); 1245 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1246 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1247 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong no of nonzeros"); 1248 /* Initial matrix as if empty */ 1249 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1250 submats[i]->factortype = C->factortype; 1251 } 1252 } else { 1253 for (i=0; i<ismax; i++) { 1254 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1255 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1256 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1257 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1258 } 1259 } 1260 1261 /* Assemble the matrices */ 1262 /* First assemble the local rows */ 1263 { 1264 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1265 PetscScalar *imat_a; 1266 1267 for (i=0; i<ismax; i++) { 1268 mat = (Mat_SeqAIJ*)submats[i]->data; 1269 imat_ilen = mat->ilen; 1270 imat_j = mat->j; 1271 imat_i = mat->i; 1272 imat_a = mat->a; 1273 cmap_i = cmap[i]; 1274 rmap_i = rmap[i]; 1275 irow_i = irow[i]; 1276 jmax = nrow[i]; 1277 for (j=0; j<jmax; j++) { 1278 l = 0; 1279 row = irow_i[j]; 1280 while (row >= C->rmap->range[l+1]) l++; 1281 proc = l; 1282 if (proc == rank) { 1283 old_row = row; 1284 row = rmap_i[row]; 1285 ilen_row = imat_ilen[row]; 1286 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1287 mat_i = imat_i[row] ; 1288 mat_a = imat_a + mat_i; 1289 mat_j = imat_j + mat_i; 1290 for (k=0; k<ncols; k++) { 1291 if ((tcol = cmap_i[cols[k]])) { 1292 *mat_j++ = tcol - 1; 1293 *mat_a++ = vals[k]; 1294 ilen_row++; 1295 } 1296 } 1297 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1298 imat_ilen[row] = ilen_row; 1299 } 1300 } 1301 } 1302 } 1303 1304 /* Now assemble the off proc rows*/ 1305 { 1306 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1307 PetscInt *imat_j,*imat_i; 1308 PetscScalar *imat_a,*rbuf4_i; 1309 1310 for (tmp2=0; tmp2<nrqs; tmp2++) { 1311 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1312 idex = 
pa[idex2]; 1313 sbuf1_i = sbuf1[idex]; 1314 jmax = sbuf1_i[0]; 1315 ct1 = 2*jmax + 1; 1316 ct2 = 0; 1317 rbuf2_i = rbuf2[idex2]; 1318 rbuf3_i = rbuf3[idex2]; 1319 rbuf4_i = rbuf4[idex2]; 1320 for (j=1; j<=jmax; j++) { 1321 is_no = sbuf1_i[2*j-1]; 1322 rmap_i = rmap[is_no]; 1323 cmap_i = cmap[is_no]; 1324 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1325 imat_ilen = mat->ilen; 1326 imat_j = mat->j; 1327 imat_i = mat->i; 1328 imat_a = mat->a; 1329 max1 = sbuf1_i[2*j]; 1330 for (k=0; k<max1; k++,ct1++) { 1331 row = sbuf1_i[ct1]; 1332 row = rmap_i[row]; 1333 ilen = imat_ilen[row]; 1334 mat_i = imat_i[row] ; 1335 mat_a = imat_a + mat_i; 1336 mat_j = imat_j + mat_i; 1337 max2 = rbuf2_i[ct1]; 1338 for (l=0; l<max2; l++,ct2++) { 1339 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1340 *mat_j++ = tcol - 1; 1341 *mat_a++ = rbuf4_i[ct2]; 1342 ilen++; 1343 } 1344 } 1345 imat_ilen[row] = ilen; 1346 } 1347 } 1348 } 1349 } 1350 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1351 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1352 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1353 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1354 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1355 1356 /* Restore the indices */ 1357 for (i=0; i<ismax; i++) { 1358 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1359 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1360 } 1361 1362 /* Destroy allocated memory */ 1363 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1364 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1365 ierr = PetscFree(pa);CHKERRQ(ierr); 1366 1367 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1368 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1369 for (i=0; i<nrqr; ++i) { 1370 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1371 } 1372 for (i=0; i<nrqs; ++i) { 1373 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1374 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1375 } 1376 1377 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1378 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 
1379 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1380 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1381 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1382 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1383 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1384 1385 ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 1386 ierr = PetscFree(cmap);CHKERRQ(ierr); 1387 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1388 ierr = PetscFree(rmap);CHKERRQ(ierr); 1389 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1390 ierr = PetscFree(lens);CHKERRQ(ierr); 1391 1392 for (i=0; i<ismax; i++) { 1393 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1394 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1395 } 1396 PetscFunctionReturn(0); 1397 } 1398 1399 /* 1400 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1401 Be careful not to destroy them elsewhere. 1402 */ 1403 #undef __FUNCT__ 1404 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1405 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1406 { 1407 /* If making this function public, change the error returned in this function away from _PLIB. */ 1408 PetscErrorCode ierr; 1409 Mat_MPIAIJ *aij; 1410 PetscBool seqaij; 1411 1412 PetscFunctionBegin; 1413 /* Check to make sure the component matrices are compatible with C. 
*/ 1414 ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1415 if(!seqaij) { 1416 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1417 } 1418 ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1419 if(!seqaij) { 1420 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1421 } 1422 if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) { 1423 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1424 } 1425 ierr = MatCreate(comm, C); CHKERRQ(ierr); 1426 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr); 1427 ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr); 1428 ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr); 1429 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1430 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1431 if((*C)->cmap->N != A->cmap->n + B->cmap->n) { 1432 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1433 } 1434 ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr); 1435 aij = (Mat_MPIAIJ*)((*C)->data); 1436 aij->A = A; 1437 aij->B = B; 1438 ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr); 1439 ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr); 1440 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1441 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1442 PetscFunctionReturn(0); 1443 } 1444 1445 #undef __FUNCT__ 1446 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1447 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1448 { 1449 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1450 PetscFunctionBegin; 1451 PetscValidPointer(A,2); 1452 PetscValidPointer(B,3); 1453 *A = aij->A; 1454 *B = aij->B; 1455 /* Note that we don't incref *A and *B, so be careful! 
*/ 1456 PetscFunctionReturn(0); 1457 } 1458 1459 1460 #undef __FUNCT__ 1461 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1462 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1463 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1464 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1465 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1466 { 1467 PetscErrorCode ierr; 1468 PetscMPIInt size, flag; 1469 PetscInt i,ii; 1470 PetscInt ismax_c; 1471 PetscFunctionBegin; 1472 if(!ismax) { 1473 PetscFunctionReturn(0); 1474 } 1475 for(i = 0, ismax_c = 0; i < ismax; ++i) { 1476 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr); 1477 if(flag != MPI_IDENT) { 1478 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1479 } 1480 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1481 if(size > 1) { 1482 ++ismax_c; 1483 } 1484 } 1485 if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 1486 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr); 1487 } 1488 else { /* if(ismax_c) */ 1489 Mat *A,*B; 1490 IS *isrow_c, *iscol_c; 1491 PetscMPIInt size; 1492 /* 1493 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1494 array of sequential matrices underlying the resulting parallel matrices. 1495 Which arrays to allocate is based on the value of MatReuse scall. 1496 There are as many diag matrices as there are original index sets. 1497 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 
1498 1499 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1500 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1501 will deposite the extracted diag and off-diag parts. 1502 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1503 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1504 */ 1505 1506 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1507 if (scall != MAT_REUSE_MATRIX) { 1508 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1509 } 1510 else { 1511 ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr); 1512 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr); 1513 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1514 for(i = 0, ii = 0; i < ismax; ++i) { 1515 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1516 if(size > 1) { 1517 ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr); 1518 ++ii; 1519 } 1520 else { 1521 A[i] = (*submat)[i]; 1522 } 1523 } 1524 } 1525 /* 1526 Construct the complements of the iscol ISs for parallel ISs only. 1527 These are used to extract the off-diag portion of the resulting parallel matrix. 1528 The row IS for the off-diag portion is the same as for the diag portion, 1529 so we merely alias the row IS, while skipping those that are sequential. 1530 */ 1531 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr); 1532 for(i = 0, ii = 0; i < ismax; ++i) { 1533 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1534 if(size > 1) { 1535 isrow_c[ii] = isrow[i]; 1536 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr); 1537 ++ii; 1538 } 1539 } 1540 /* Now obtain the sequential A and B submatrices separately. 
*/ 1541 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr); 1542 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr); 1543 for(ii = 0; ii < ismax_c; ++ii) { 1544 ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr); 1545 } 1546 ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr); 1547 /* 1548 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1549 have been extracted directly into the parallel matrices containing them, or 1550 simply into the sequential matrix identical with the corresponding A (if size == 1). 1551 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1552 A is put into the correct submat slot (if size == 1). 1553 */ 1554 if(scall != MAT_REUSE_MATRIX) { 1555 for(i = 0, ii = 0; i < ismax; ++i) { 1556 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1557 if(size > 1) { 1558 /* 1559 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1560 */ 1561 /* Construct submat[i] from the Seq pieces A and B. */ 1562 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr); 1563 1564 ++ii; 1565 } 1566 else { 1567 (*submat)[i] = A[i]; 1568 } 1569 } 1570 } 1571 ierr = PetscFree(A); CHKERRQ(ierr); 1572 ierr = PetscFree(B); CHKERRQ(ierr); 1573 } 1574 PetscFunctionReturn(0); 1575 }/* MatGetSubMatricesParallel_MPIXAIJ() */ 1576 1577 1578 1579 #undef __FUNCT__ 1580 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1581 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1582 { 1583 PetscErrorCode ierr; 1584 PetscFunctionBegin; 1585 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ, 1586 MatCreateMPIAIJFromSeqMatrices_Private, 1587 MatMPIAIJExtractSeqMatrices_Private); 1588 CHKERRQ(ierr); 1589 PetscFunctionReturn(0); 1590 } 1591