#define PETSCMAT_DLL

/*
   Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that were shared across processors.
*/
#include "../src/mat/impls/aij/mpi/mpiaij.h"
#include "petscbt.h"

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the imax index sets is[]
   by ov levels.  Each level of overlap is one full parallel round performed by
   MatIncreaseOverlap_MPIAIJ_Once; the is[] array is replaced in place with the
   enlarged index sets.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Adds one level of overlap to each of the imax
   index sets in is[].  Rows owned locally are expanded in place by
   MatIncreaseOverlap_MPIAIJ_Local; rows owned elsewhere are shipped to their
   owning process (message format documented above), expanded there by
   MatIncreaseOverlap_MPIAIJ_Receive, and the results are shipped back and merged.
   The input index sets are destroyed and replaced with the enlarged ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,*rtable,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       m,i,j,k,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  m    = C->rmap->N;   /* global number of rows */

  /* two fresh tags keep the two communication phases separate */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc3(imax,PetscInt*,&idx,imax,PetscInt,&n,m,PetscInt,&rtable);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc */
  for (i=0,j=0; i<size; i++) {
    len = C->rmap->range[i+1];
    for (; j<len; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      proc = rtable[row];
      w4[proc]++;            /* w4[p] = rows of this IS owned by proc p */
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}  /* w1 = total data, w3 = no of ISs sent to proc j */
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header =  1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single buffer tmp into one outgoing message per destination */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* data area starts after the header */
  }

  /* Memory for doing local proc's work */
  {
    /* one allocation carved into: table[] (bit tables), data[] (row lists),
       isz[] (list lengths), then the actual storage for data and the bit tables */
    PetscInt *d_p;
    char     *t_p;

    /* should replace with PetscMallocN() */
    ierr  = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                        (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr  = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                        (m)*imax*sizeof(PetscInt)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data  = (PetscInt **)(table + imax);
    isz   = (PetscInt  *)(data  + imax);
    d_p   = (PetscInt  *)(isz   + imax);
    t_p   = (char      *)(d_p   + m*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + (m)*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        proc = rtable[row];
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree3(idx,n,rtable);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the received requests and send the enlarged lists back */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;  /* skip the header */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged output index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;   /* row pointers/columns of the diagonal block */
  aj     = a->j;
  bi     = b->i;   /* row pointers/columns of the off-diagonal block */
  bj     = b->j;
  garray = c->garray;  /* maps compressed off-diagonal columns to global columns */


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* for every locally owned row already in the set, add the global columns
       of all its nonzeros; the bit table suppresses duplicates and new entries
       are appended so they are picked up by the next overlap round */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;  /* local row index */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

     Input:
      C    - the matrix
      nrqr - no of messages being processed.
415 rbuf - an array of pointers to the recieved requests 416 417 Output: 418 xdata - array of messages to be sent back 419 isz1 - size of each message 420 421 For better efficiency perhaps we should malloc separately each xdata[i], 422 then if a remalloc is required we need only copy the data for that one row 423 rather then all previous rows as it is now where a single large chunck of 424 memory is used. 425 426 */ 427 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 428 { 429 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 430 Mat A = c->A,B = c->B; 431 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 432 PetscErrorCode ierr; 433 PetscMPIInt rank; 434 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 435 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 436 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 437 PetscInt *rbuf_i,kmax,rbuf_0; 438 PetscBT xtable; 439 440 PetscFunctionBegin; 441 rank = c->rank; 442 m = C->rmap->N; 443 rstart = C->rmap->rstart; 444 cstart = C->cmap->rstart; 445 ai = a->i; 446 aj = a->j; 447 bi = b->i; 448 bj = b->j; 449 garray = c->garray; 450 451 452 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 453 rbuf_i = rbuf[i]; 454 rbuf_0 = rbuf_i[0]; 455 ct += rbuf_0; 456 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 457 } 458 459 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 460 else max1 = 1; 461 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 462 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 463 ++no_malloc; 464 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 465 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 466 467 ct3 = 0; 468 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 469 rbuf_i = rbuf[i]; 470 rbuf_0 = rbuf_i[0]; 471 ct1 = 2*rbuf_0+1; 472 ct2 = ct1; 473 ct3 += ct1; 474 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 475 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 476 oct2 = ct2; 477 kmax = rbuf_i[2*j]; 478 for (k=0; k<kmax; k++,ct1++) { 479 row = rbuf_i[ct1]; 480 if (!PetscBTLookupSet(xtable,row)) { 481 if (!(ct3 < mem_estimate)) { 482 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 483 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 484 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 485 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 486 xdata[0] = tmp; 487 mem_estimate = new_estimate; ++no_malloc; 488 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 489 } 490 xdata[i][ct2++] = row; 491 ct3++; 492 } 493 } 494 for (k=oct2,max2=ct2; k<max2; k++) { 495 row = xdata[i][k] - rstart; 496 start = ai[row]; 497 end = ai[row+1]; 498 for (l=start; l<end; l++) { 499 val = aj[l] + cstart; 500 if (!PetscBTLookupSet(xtable,val)) { 501 if (!(ct3 < mem_estimate)) { 502 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 503 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 504 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 505 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 506 xdata[0] = tmp; 507 mem_estimate = new_estimate; ++no_malloc; 508 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 509 } 510 xdata[i][ct2++] = val; 511 ct3++; 512 } 513 } 514 start = bi[row]; 515 end = bi[row+1]; 516 for (l=start; l<end; l++) { 517 val = garray[bj[l]]; 518 if (!PetscBTLookupSet(xtable,val)) { 519 if (!(ct3 < 
mem_estimate)) { 520 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 521 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 522 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 523 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 524 xdata[0] = tmp; 525 mem_estimate = new_estimate; ++no_malloc; 526 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 527 } 528 xdata[i][ct2++] = val; 529 ct3++; 530 } 531 } 532 } 533 /* Update the header*/ 534 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 535 xdata[i][2*j-1] = rbuf_i[2*j-1]; 536 } 537 xdata[i][0] = rbuf_0; 538 xdata[i+1] = xdata[i] + ct2; 539 isz1[i] = ct2; /* size of each message */ 540 } 541 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 542 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 543 PetscFunctionReturn(0); 544 } 545 /* -------------------------------------------------------------------------*/ 546 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 547 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 548 /* 549 Every processor gets the entire matrix 550 */ 551 #undef __FUNCT__ 552 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 553 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 554 { 555 Mat B; 556 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 557 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 558 PetscErrorCode ierr; 559 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 560 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 561 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 562 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 563 564 PetscFunctionBegin; 565 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 566 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 567 568 if (scall == MAT_INITIAL_MATRIX) { 569 /* ---------------------------------------------------------------- 570 Tell every processor the number of nonzeros per row 571 */ 572 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 573 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 574 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 575 } 576 sendcount = A->rmap->rend - A->rmap->rstart; 577 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 578 for (i=0; i<size; i++) { 579 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 580 displs[i] = A->rmap->range[i]; 581 } 582 #if defined(PETSC_HAVE_MPI_IN_PLACE) 583 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 584 #else 585 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 586 #endif 587 /* --------------------------------------------------------------- 588 Create the sequential matrix of the same type as the local block diagonal 589 */ 590 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 591 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 592 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 593 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 594 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 595 **Bin = B; 596 b = (Mat_SeqAIJ *)B->data; 597 598 /*-------------------------------------------------------------------- 599 Copy my part of matrix column indices over 600 */ 601 sendcount = ad->nz + bd->nz; 602 jsendbuf = b->j + b->i[rstarts[rank]]; 603 a_jsendbuf = ad->j; 604 b_jsendbuf = bd->j; 605 n = A->rmap->rend - A->rmap->rstart; 606 cnt = 0; 607 for (i=0; i<n; i++) { 608 609 /* put in lower 
diagonal portion */ 610 m = bd->i[i+1] - bd->i[i]; 611 while (m > 0) { 612 /* is it above diagonal (in bd (compressed) numbering) */ 613 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 614 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 615 m--; 616 } 617 618 /* put in diagonal portion */ 619 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 620 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 621 } 622 623 /* put in upper diagonal portion */ 624 while (m-- > 0) { 625 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 626 } 627 } 628 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 629 630 /*-------------------------------------------------------------------- 631 Gather all column indices to all processors 632 */ 633 for (i=0; i<size; i++) { 634 recvcounts[i] = 0; 635 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 636 recvcounts[i] += lens[j]; 637 } 638 } 639 displs[0] = 0; 640 for (i=1; i<size; i++) { 641 displs[i] = displs[i-1] + recvcounts[i-1]; 642 } 643 #if defined(PETSC_HAVE_MPI_IN_PLACE) 644 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 645 #else 646 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 647 #endif 648 /*-------------------------------------------------------------------- 649 Assemble the matrix into useable form (note numerical values not yet set) 650 */ 651 /* set the b->ilen (length of each row) values */ 652 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 653 /* set the b->i indices */ 654 b->i[0] = 0; 655 for (i=1; i<=A->rmap->N; i++) { 656 b->i[i] = b->i[i-1] + lens[i-1]; 657 } 658 ierr = PetscFree(lens);CHKERRQ(ierr); 659 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 660 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 661 662 } else { 663 B = **Bin; 664 b = (Mat_SeqAIJ 
*)B->data; 665 } 666 667 /*-------------------------------------------------------------------- 668 Copy my part of matrix numerical values into the values location 669 */ 670 if (flag == MAT_GET_VALUES){ 671 sendcount = ad->nz + bd->nz; 672 sendbuf = b->a + b->i[rstarts[rank]]; 673 a_sendbuf = ad->a; 674 b_sendbuf = bd->a; 675 b_sendj = bd->j; 676 n = A->rmap->rend - A->rmap->rstart; 677 cnt = 0; 678 for (i=0; i<n; i++) { 679 680 /* put in lower diagonal portion */ 681 m = bd->i[i+1] - bd->i[i]; 682 while (m > 0) { 683 /* is it above diagonal (in bd (compressed) numbering) */ 684 if (garray[*b_sendj] > A->rmap->rstart + i) break; 685 sendbuf[cnt++] = *b_sendbuf++; 686 m--; 687 b_sendj++; 688 } 689 690 /* put in diagonal portion */ 691 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 692 sendbuf[cnt++] = *a_sendbuf++; 693 } 694 695 /* put in upper diagonal portion */ 696 while (m-- > 0) { 697 sendbuf[cnt++] = *b_sendbuf++; 698 b_sendj++; 699 } 700 } 701 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 702 703 /* ----------------------------------------------------------------- 704 Gather all numerical values to all processors 705 */ 706 if (!recvcounts) { 707 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 708 } 709 for (i=0; i<size; i++) { 710 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 711 } 712 displs[0] = 0; 713 for (i=1; i<size; i++) { 714 displs[i] = displs[i-1] + recvcounts[i-1]; 715 } 716 recvbuf = b->a; 717 #if defined(PETSC_HAVE_MPI_IN_PLACE) 718 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 719 #else 720 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 721 #endif 722 } /* endof (flag == MAT_GET_VALUES) */ 723 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 724 725 if 
(A->symmetric){ 726 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 727 } else if (A->hermitian) { 728 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 729 } else if (A->structurally_symmetric) { 730 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 731 } 732 PetscFunctionReturn(0); 733 } 734 735 736 737 #undef __FUNCT__ 738 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 739 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 740 { 741 PetscErrorCode ierr; 742 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 743 PetscBool rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix; 744 745 PetscFunctionBegin; 746 /* 747 Check for special case each processor gets entire matrix 748 */ 749 if (ismax == 1 && C->rmap->N == C->cmap->N) { 750 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 751 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 752 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 753 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 754 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 755 wantallmatrix = PETSC_TRUE; 756 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 757 } 758 } 759 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 760 if (twantallmatrix) { 761 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 762 PetscFunctionReturn(0); 763 } 764 765 /* Allocate memory to hold all the submatrices */ 766 if (scall != MAT_REUSE_MATRIX) { 767 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 768 } 769 /* Determine the number of stages through which submatrices are done */ 770 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 771 /* 772 Each stage will extract nmax submatrices. 
773 nmax is determined by the matrix column dimension. 774 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 775 */ 776 if (!nmax) nmax = 1; 777 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 778 779 /* Make sure every processor loops through the nstages */ 780 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 781 782 for (i=0,pos=0; i<nstages; i++) { 783 if (pos+nmax <= ismax) max_no = nmax; 784 else if (pos == ismax) max_no = 0; 785 else max_no = ismax-pos; 786 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 787 pos += max_no; 788 } 789 PetscFunctionReturn(0); 790 } 791 792 793 794 795 /* -------------------------------------------------------------------------*/ 796 #undef __FUNCT__ 797 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 798 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 799 { 800 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 801 Mat A = c->A; 802 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 803 const PetscInt **icol,**irow; 804 PetscInt *nrow,*ncol,start; 805 PetscErrorCode ierr; 806 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 807 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 808 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 809 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap; 810 PetscInt **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 811 const PetscInt *irow_i; 812 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i; 813 PetscInt *rmap_i; 814 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 815 MPI_Request *r_waits4,*s_waits3,*s_waits4; 816 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 817 MPI_Status *r_status3,*r_status4,*s_status4; 818 MPI_Comm comm; 819 PetscScalar 
**rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 820 PetscBool sorted; 821 PetscMPIInt *onodes1,*olengths1; 822 PetscMPIInt idex,idex2,end; 823 824 PetscFunctionBegin; 825 comm = ((PetscObject)C)->comm; 826 tag0 = ((PetscObject)C)->tag; 827 size = c->size; 828 rank = c->rank; 829 830 /* Get some new tags to keep the communication clean */ 831 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 832 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 833 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 834 835 /* Check if the col indices are sorted */ 836 for (i=0; i<ismax; i++) { 837 ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr); 838 /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/ 839 ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr); 840 /* if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */ 841 } 842 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 843 844 for (i=0; i<ismax; i++) { 845 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 846 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 847 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 848 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 849 } 850 851 /* evaluate communication - mesg to who, length of mesg, and buffer space 852 required. 
Based on this, buffers are allocated, and data copied into them*/ 853 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 854 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 855 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 856 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 857 for (i=0; i<ismax; i++) { 858 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 859 jmax = nrow[i]; 860 irow_i = irow[i]; 861 for (j=0; j<jmax; j++) { 862 l = 0; 863 row = irow_i[j]; 864 while (row >= C->rmap->range[l+1]) l++; 865 proc = l; 866 w4[proc]++; 867 } 868 for (j=0; j<size; j++) { 869 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 870 } 871 } 872 873 nrqs = 0; /* no of outgoing messages */ 874 msz = 0; /* total mesg length (for all procs) */ 875 w1[rank] = 0; /* no mesg sent to self */ 876 w3[rank] = 0; 877 for (i=0; i<size; i++) { 878 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 879 } 880 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 881 for (i=0,j=0; i<size; i++) { 882 if (w1[i]) { pa[j] = i; j++; } 883 } 884 885 /* Each message would have a header = 1 + 2*(no of IS) + data */ 886 for (i=0; i<nrqs; i++) { 887 j = pa[i]; 888 w1[j] += w2[j] + 2* w3[j]; 889 msz += w1[j]; 890 } 891 892 /* Determine the number of messages to expect, their lengths, from from-ids */ 893 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 894 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 895 896 /* Now post the Irecvs corresponding to these messages */ 897 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 898 899 ierr = PetscFree(onodes1);CHKERRQ(ierr); 900 ierr = 
PetscFree(olengths1);CHKERRQ(ierr); 901 902 /* Allocate Memory for outgoing messages */ 903 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 904 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 905 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 906 907 { 908 PetscInt *iptr = tmp,ict = 0; 909 for (i=0; i<nrqs; i++) { 910 j = pa[i]; 911 iptr += ict; 912 sbuf1[j] = iptr; 913 ict = w1[j]; 914 } 915 } 916 917 /* Form the outgoing messages */ 918 /* Initialize the header space */ 919 for (i=0; i<nrqs; i++) { 920 j = pa[i]; 921 sbuf1[j][0] = 0; 922 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 923 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 924 } 925 926 /* Parse the isrow and copy data into outbuf */ 927 for (i=0; i<ismax; i++) { 928 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 929 irow_i = irow[i]; 930 jmax = nrow[i]; 931 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 932 l = 0; 933 row = irow_i[j]; 934 while (row >= C->rmap->range[l+1]) l++; 935 proc = l; 936 if (proc != rank) { /* copy to the outgoing buf*/ 937 ctr[proc]++; 938 *ptr[proc] = row; 939 ptr[proc]++; 940 } 941 } 942 /* Update the headers for the current IS */ 943 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 944 if ((ctr_j = ctr[j])) { 945 sbuf1_j = sbuf1[j]; 946 k = ++sbuf1_j[0]; 947 sbuf1_j[2*k] = ctr_j; 948 sbuf1_j[2*k-1] = i; 949 } 950 } 951 } 952 953 /* Now post the sends */ 954 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 955 for (i=0; i<nrqs; ++i) { 956 j = pa[i]; 957 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 958 } 959 960 /* Post Receives to capture the buffer size */ 961 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 962 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 963 rbuf2[0] = tmp + msz; 964 for (i=1; i<nrqs; ++i) { 965 
rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 966 } 967 for (i=0; i<nrqs; ++i) { 968 j = pa[i]; 969 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 970 } 971 972 /* Send to other procs the buf size they should allocate */ 973 974 975 /* Receive messages*/ 976 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 977 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 978 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 979 { 980 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 981 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 982 PetscInt *sbuf2_i; 983 984 for (i=0; i<nrqr; ++i) { 985 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 986 req_size[idex] = 0; 987 rbuf1_i = rbuf1[idex]; 988 start = 2*rbuf1_i[0] + 1; 989 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 990 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 991 sbuf2_i = sbuf2[idex]; 992 for (j=start; j<end; j++) { 993 id = rbuf1_i[j] - rstart; 994 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 995 sbuf2_i[j] = ncols; 996 req_size[idex] += ncols; 997 } 998 req_source[idex] = r_status1[i].MPI_SOURCE; 999 /* form the header */ 1000 sbuf2_i[0] = req_size[idex]; 1001 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1002 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1003 } 1004 } 1005 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1006 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1007 1008 /* recv buffer sizes */ 1009 /* Receive messages*/ 1010 1011 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1012 ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1013 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1014 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 
1015 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1016 1017 for (i=0; i<nrqs; ++i) { 1018 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1019 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1020 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1021 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1022 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1023 } 1024 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1025 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1026 1027 /* Wait on sends1 and sends2 */ 1028 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1029 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1030 1031 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1032 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1033 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1034 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1035 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1036 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1037 1038 /* Now allocate buffers for a->j, and send them off */ 1039 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1040 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1041 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1042 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1043 1044 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1045 { 1046 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1047 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1048 PetscInt cend = C->cmap->rend; 1049 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1050 1051 for (i=0; i<nrqr; i++) { 1052 rbuf1_i = rbuf1[i]; 1053 sbuf_aj_i 
= sbuf_aj[i]; 1054 ct1 = 2*rbuf1_i[0] + 1; 1055 ct2 = 0; 1056 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1057 kmax = rbuf1[i][2*j]; 1058 for (k=0; k<kmax; k++,ct1++) { 1059 row = rbuf1_i[ct1] - rstart; 1060 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1061 ncols = nzA + nzB; 1062 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1063 1064 /* load the column indices for this row into cols*/ 1065 cols = sbuf_aj_i + ct2; 1066 1067 lwrite = 0; 1068 for (l=0; l<nzB; l++) { 1069 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1070 } 1071 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1072 for (l=0; l<nzB; l++) { 1073 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1074 } 1075 1076 ct2 += ncols; 1077 } 1078 } 1079 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1080 } 1081 } 1082 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1083 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1084 1085 /* Allocate buffers for a->a, and send them off */ 1086 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1087 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1088 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1089 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1090 1091 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1092 { 1093 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1094 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1095 PetscInt cend = C->cmap->rend; 1096 PetscInt *b_j = b->j; 1097 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1098 1099 for (i=0; i<nrqr; i++) { 1100 rbuf1_i = rbuf1[i]; 1101 sbuf_aa_i = sbuf_aa[i]; 1102 ct1 = 2*rbuf1_i[0]+1; 1103 ct2 = 0; 1104 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1105 kmax = rbuf1_i[2*j]; 1106 for (k=0; k<kmax; k++,ct1++) { 1107 row = rbuf1_i[ct1] 
- rstart; 1108 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1109 ncols = nzA + nzB; 1110 cworkB = b_j + b_i[row]; 1111 vworkA = a_a + a_i[row]; 1112 vworkB = b_a + b_i[row]; 1113 1114 /* load the column values for this row into vals*/ 1115 vals = sbuf_aa_i+ct2; 1116 1117 lwrite = 0; 1118 for (l=0; l<nzB; l++) { 1119 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1120 } 1121 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1122 for (l=0; l<nzB; l++) { 1123 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1124 } 1125 1126 ct2 += ncols; 1127 } 1128 } 1129 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1130 } 1131 } 1132 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1133 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1134 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1135 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1136 1137 /* Form the matrix */ 1138 /* create col map */ 1139 { 1140 const PetscInt *icol_i; 1141 1142 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1143 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1144 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1145 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1146 for (i=0; i<ismax; i++) { 1147 jmax = ncol[i]; 1148 icol_i = icol[i]; 1149 cmap_i = cmap[i]; 1150 for (j=0; j<jmax; j++) { 1151 cmap_i[icol_i[j]] = j+1; 1152 } 1153 } 1154 } 1155 1156 /* Create lens which is required for MatCreate... 
*/ 1157 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1158 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1159 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1160 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1161 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1162 1163 /* Update lens from local data */ 1164 for (i=0; i<ismax; i++) { 1165 jmax = nrow[i]; 1166 cmap_i = cmap[i]; 1167 irow_i = irow[i]; 1168 lens_i = lens[i]; 1169 for (j=0; j<jmax; j++) { 1170 l = 0; 1171 row = irow_i[j]; 1172 while (row >= C->rmap->range[l+1]) l++; 1173 proc = l; 1174 if (proc == rank) { 1175 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1176 for (k=0; k<ncols; k++) { 1177 if (cmap_i[cols[k]]) { lens_i[j]++;} 1178 } 1179 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1180 } 1181 } 1182 } 1183 1184 /* Create row map*/ 1185 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1186 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1187 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1188 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1189 for (i=0; i<ismax; i++) { 1190 rmap_i = rmap[i]; 1191 irow_i = irow[i]; 1192 jmax = nrow[i]; 1193 for (j=0; j<jmax; j++) { 1194 rmap_i[irow_i[j]] = j; 1195 } 1196 } 1197 1198 /* Update lens from offproc data */ 1199 { 1200 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1201 1202 for (tmp2=0; tmp2<nrqs; tmp2++) { 1203 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1204 idex = pa[idex2]; 1205 sbuf1_i = sbuf1[idex]; 1206 jmax = sbuf1_i[0]; 1207 ct1 = 2*jmax+1; 1208 ct2 = 0; 1209 rbuf2_i = rbuf2[idex2]; 1210 rbuf3_i = rbuf3[idex2]; 1211 for (j=1; j<=jmax; j++) { 1212 is_no = sbuf1_i[2*j-1]; 1213 max1 = sbuf1_i[2*j]; 1214 lens_i = lens[is_no]; 1215 cmap_i = cmap[is_no]; 1216 rmap_i = rmap[is_no]; 1217 for (k=0; k<max1; k++,ct1++) { 1218 row = rmap_i[sbuf1_i[ct1]]; 
/* the val in the new matrix to be */ 1219 max2 = rbuf2_i[ct1]; 1220 for (l=0; l<max2; l++,ct2++) { 1221 if (cmap_i[rbuf3_i[ct2]]) { 1222 lens_i[row]++; 1223 } 1224 } 1225 } 1226 } 1227 } 1228 } 1229 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1230 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1231 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1232 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1233 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1234 1235 /* Create the submatrices */ 1236 if (scall == MAT_REUSE_MATRIX) { 1237 PetscBool flag; 1238 1239 /* 1240 Assumes new rows are same length as the old rows,hence bug! 1241 */ 1242 for (i=0; i<ismax; i++) { 1243 mat = (Mat_SeqAIJ *)(submats[i]->data); 1244 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1245 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1246 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. 
wrong no of nonzeros"); 1247 /* Initial matrix as if empty */ 1248 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1249 submats[i]->factortype = C->factortype; 1250 } 1251 } else { 1252 for (i=0; i<ismax; i++) { 1253 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1254 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1255 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1256 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1257 } 1258 } 1259 1260 /* Assemble the matrices */ 1261 /* First assemble the local rows */ 1262 { 1263 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1264 PetscScalar *imat_a; 1265 1266 for (i=0; i<ismax; i++) { 1267 mat = (Mat_SeqAIJ*)submats[i]->data; 1268 imat_ilen = mat->ilen; 1269 imat_j = mat->j; 1270 imat_i = mat->i; 1271 imat_a = mat->a; 1272 cmap_i = cmap[i]; 1273 rmap_i = rmap[i]; 1274 irow_i = irow[i]; 1275 jmax = nrow[i]; 1276 for (j=0; j<jmax; j++) { 1277 l = 0; 1278 row = irow_i[j]; 1279 while (row >= C->rmap->range[l+1]) l++; 1280 proc = l; 1281 if (proc == rank) { 1282 old_row = row; 1283 row = rmap_i[row]; 1284 ilen_row = imat_ilen[row]; 1285 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1286 mat_i = imat_i[row] ; 1287 mat_a = imat_a + mat_i; 1288 mat_j = imat_j + mat_i; 1289 for (k=0; k<ncols; k++) { 1290 if ((tcol = cmap_i[cols[k]])) { 1291 *mat_j++ = tcol - 1; 1292 *mat_a++ = vals[k]; 1293 ilen_row++; 1294 } 1295 } 1296 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1297 imat_ilen[row] = ilen_row; 1298 } 1299 } 1300 } 1301 } 1302 1303 /* Now assemble the off proc rows*/ 1304 { 1305 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1306 PetscInt *imat_j,*imat_i; 1307 PetscScalar *imat_a,*rbuf4_i; 1308 1309 for (tmp2=0; tmp2<nrqs; tmp2++) { 1310 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1311 idex = 
pa[idex2]; 1312 sbuf1_i = sbuf1[idex]; 1313 jmax = sbuf1_i[0]; 1314 ct1 = 2*jmax + 1; 1315 ct2 = 0; 1316 rbuf2_i = rbuf2[idex2]; 1317 rbuf3_i = rbuf3[idex2]; 1318 rbuf4_i = rbuf4[idex2]; 1319 for (j=1; j<=jmax; j++) { 1320 is_no = sbuf1_i[2*j-1]; 1321 rmap_i = rmap[is_no]; 1322 cmap_i = cmap[is_no]; 1323 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1324 imat_ilen = mat->ilen; 1325 imat_j = mat->j; 1326 imat_i = mat->i; 1327 imat_a = mat->a; 1328 max1 = sbuf1_i[2*j]; 1329 for (k=0; k<max1; k++,ct1++) { 1330 row = sbuf1_i[ct1]; 1331 row = rmap_i[row]; 1332 ilen = imat_ilen[row]; 1333 mat_i = imat_i[row] ; 1334 mat_a = imat_a + mat_i; 1335 mat_j = imat_j + mat_i; 1336 max2 = rbuf2_i[ct1]; 1337 for (l=0; l<max2; l++,ct2++) { 1338 if ((tcol = cmap_i[rbuf3_i[ct2]])) { 1339 *mat_j++ = tcol - 1; 1340 *mat_a++ = rbuf4_i[ct2]; 1341 ilen++; 1342 } 1343 } 1344 imat_ilen[row] = ilen; 1345 } 1346 } 1347 } 1348 } 1349 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1350 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1351 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1352 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1353 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1354 1355 /* Restore the indices */ 1356 for (i=0; i<ismax; i++) { 1357 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1358 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1359 } 1360 1361 /* Destroy allocated memory */ 1362 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1363 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1364 ierr = PetscFree(pa);CHKERRQ(ierr); 1365 1366 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1367 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1368 for (i=0; i<nrqr; ++i) { 1369 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1370 } 1371 for (i=0; i<nrqs; ++i) { 1372 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1373 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1374 } 1375 1376 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1377 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 
1378 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1379 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1380 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1381 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1382 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1383 1384 ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 1385 ierr = PetscFree(cmap);CHKERRQ(ierr); 1386 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1387 ierr = PetscFree(rmap);CHKERRQ(ierr); 1388 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1389 ierr = PetscFree(lens);CHKERRQ(ierr); 1390 1391 for (i=0; i<ismax; i++) { 1392 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1393 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1394 } 1395 PetscFunctionReturn(0); 1396 } 1397 1398 /* 1399 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1400 Be careful not to destroy them elsewhere. 1401 */ 1402 #undef __FUNCT__ 1403 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1404 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1405 { 1406 /* If making this function public, change the error returned in this function away from _PLIB. */ 1407 PetscErrorCode ierr; 1408 Mat_MPIAIJ *aij; 1409 PetscBool seqaij; 1410 1411 PetscFunctionBegin; 1412 /* Check to make sure the component matrices are compatible with C. 
*/ 1413 ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1414 if(!seqaij) { 1415 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1416 } 1417 ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1418 if(!seqaij) { 1419 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1420 } 1421 if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) { 1422 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1423 } 1424 ierr = MatCreate(comm, C); CHKERRQ(ierr); 1425 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr); 1426 ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr); 1427 ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr); 1428 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1429 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1430 if((*C)->cmap->N != A->cmap->n + B->cmap->n) { 1431 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1432 } 1433 ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr); 1434 aij = (Mat_MPIAIJ*)((*C)->data); 1435 aij->A = A; 1436 aij->B = B; 1437 ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr); 1438 ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr); 1439 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1440 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1441 PetscFunctionReturn(0); 1442 } 1443 1444 #undef __FUNCT__ 1445 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1446 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1447 { 1448 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1449 PetscFunctionBegin; 1450 PetscValidPointer(A,2); 1451 PetscValidPointer(B,3); 1452 *A = aij->A; 1453 *B = aij->B; 1454 /* Note that we don't incref *A and *B, so be careful! 
*/ 1455 PetscFunctionReturn(0); 1456 } 1457 1458 1459 #undef __FUNCT__ 1460 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1461 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1462 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1463 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1464 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1465 { 1466 PetscErrorCode ierr; 1467 PetscMPIInt size, flag; 1468 PetscInt i,ii; 1469 PetscInt ismax_c; 1470 PetscFunctionBegin; 1471 if(!ismax) { 1472 PetscFunctionReturn(0); 1473 } 1474 for(i = 0, ismax_c = 0; i < ismax; ++i) { 1475 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr); 1476 if(flag != MPI_IDENT) { 1477 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1478 } 1479 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1480 if(size > 1) { 1481 ++ismax_c; 1482 } 1483 } 1484 if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 1485 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr); 1486 } 1487 else { 1488 Mat *A,*B; 1489 IS *isrow_c, *iscol_c; 1490 PetscMPIInt size; 1491 /* 1492 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1493 array of sequential matrices underlying the resulting parallel matrices. 1494 Which arrays to allocate is based on the value of MatReuse scall. 1495 There are as many diag matrices as there are original index sets. 1496 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 1497 */ 1498 1499 /* 1500 Sequential matrix arrays are allocated in any event. 1501 Even if the array of parallel matrices already exists, we have to consolidate the underlying seq matrices. 
1502 */ 1503 ierr = PetscMalloc2(ismax, Mat, &A, ismax_c, Mat, &B); CHKERRQ(ierr); 1504 1505 /* Parallel matrix array is only allocated if no reuse is taking place. */ 1506 if (scall != MAT_REUSE_MATRIX) { 1507 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1508 } 1509 else { 1510 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1511 for(i = 0, ii = 0; i < ismax; ++i) { 1512 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1513 if(size > 1) { 1514 ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr); 1515 ++ii; 1516 } 1517 else { 1518 A[i] = (*submat)[i]; 1519 } 1520 } 1521 } 1522 /* 1523 Construct the complements of the iscol ISs for parallel ISs only. 1524 These are used to extract the off-diag portion of the resulting parallel matrix. 1525 The row IS for the off-diag portion is the same as for the diag portion, 1526 so we merely alias the row IS, while skipping those that are sequential. 1527 */ 1528 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr); 1529 for(i = 0, ii = 0; i < ismax; ++i) { 1530 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1531 if(size > 1) { 1532 isrow_c[ii] = isrow[i]; 1533 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr); 1534 ++ii; 1535 } 1536 } 1537 /* Now obtain the sequential A and B submatrices separately. 
*/ 1538 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr); 1539 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr); 1540 for(ii = 0; ii < ismax_c; ++ii) { 1541 ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr); 1542 } 1543 ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr); 1544 /* 1545 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1546 have been extracted directly into the parallel matrices containing them, or 1547 simply into the sequential matrix identical with the corresponding A (if size == 1). 1548 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1549 A is put into the correct submat slot (if size == 1). 1550 */ 1551 if(scall != MAT_REUSE_MATRIX) { 1552 for(i = 0, ii = 0; i < ismax; ++i) { 1553 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1554 if(size > 1) { 1555 /* 1556 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1557 */ 1558 /* Construct submat[i] from the Seq pieces A and B. */ 1559 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr); 1560 1561 ++ii; 1562 } 1563 else { 1564 (*submat)[i] = A[i]; 1565 } 1566 } 1567 } 1568 ierr = PetscFree2(A,B); CHKERRQ(ierr); 1569 } 1570 PetscFunctionReturn(0); 1571 }/* MatGetSubMatricesParallel_MPIXAIJ() */ 1572 1573 1574 1575 #undef __FUNCT__ 1576 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1577 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1578 { 1579 PetscErrorCode ierr; 1580 PetscFunctionBegin; 1581 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ, 1582 MatCreateMPIAIJFromSeqMatrices_Private, 1583 MatMPIAIJExtractSeqMatrices_Private); 1584 CHKERRQ(ierr); 1585 PetscFunctionReturn(0); 1586 } 1587