#ifndef lint
static char vcid[] = "$Id: mpiov.c,v 1.25 1996/02/15 23:59:09 balay Exp balay $";
#endif

/*
   Routines to increase the overlap of a parallel AIJ matrix's index sets
   (for overlapping additive-Schwarz type methods) and to extract
   sequential submatrices from a parallel AIJ matrix.
*/

#include "mpiaij.h"
#include "inline/bitarray.h"

static int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *);
static int FindOverlapLocal(Mat , int , char **,int*, int**);
static int FindOverlapRecievedMesg(Mat , int, int **, int**, int* );

/*
   MatIncreaseOverlap_MPIAIJ - Grows each of the imax index sets is[] by
   'ov' levels of overlap, by applying one level of overlap 'ov' times.

   Input:
     C    - the parallel matrix (Mat_MPIAIJ)
     imax - number of index sets
     ov   - number of overlap levels requested (must be >= 0)
   Input/Output:
     is   - the index sets; each call to the private routine replaces
            them with enlarged ones (the old IS objects are destroyed).
*/
int MatIncreaseOverlap_MPIAIJ(Mat C, int imax, IS *is, int ov)
{
  int i, ierr;
  if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIAIJ: negative overlap specified\n");}
  for (i =0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_private(C, imax, is); CHKERRQ(ierr);
  }
  return 0;
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1], is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1   => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [5] = 5   => is[5]
  mesg [6] = sizeof(is[5]);
  -----------
  mesg [7]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/

/*
   MatIncreaseOverlap_MPIAIJ_private - Performs ONE level of overlap.

   Each processor parses its index sets, keeps the locally-owned rows for
   itself, and ships the off-processor rows (in the message format above)
   to their owners.  Owners compute the adjacency of the requested rows
   (FindOverlapRecievedMesg) and send the result back; local rows are
   expanded in place (FindOverlapLocal).  Finally the original IS objects
   are destroyed and replaced with the enlarged ones.
*/
static int MatIncreaseOverlap_MPIAIJ_private(Mat C, int imax, IS *is)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data;
  int         size, rank, m,i,j,k, ierr, **rbuf, row, proc, nrqs, msz, **outdat, **ptr;
  int         *ctr, *pa, tag, *tmp,bsz, nrqr , *isz, *isz1, **xdata;
  int         bsz1, **rbuf2;
  char        **table;
  MPI_Comm    comm;
  MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2 ;
  MPI_Status  *send_status ,*recv_status;
  double      space, fr, maxs;

  comm   = C->comm;
  tag    = C->tag;
  size   = c->size;
  rank   = c->rank;
  m      = c->M;     /* global number of rows */

  TrSpace( &space, &fr, &maxs );
  /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */

  idx    = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(idx);
  n      = (int *)PetscMalloc((imax+1)*sizeof(int )); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable);
  /* Hash table for mapping row -> proc */

  for ( i=0 ; i<imax ; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);  CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);  CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> owning proc (rowners holds
     the row-ownership ranges, so rows in [rowners[i],rowners[i+1]) belong
     to proc i) */
  for( i=0, j=0; i< size; i++) {
    for (; j <c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them. */
  w1 = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;      /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;      /* no of IS that need to be sent to proc i */
  w4 = w3 + size;      /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise w1, w2, w3 (contiguous) */
  for ( i=0; i<imax ; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector */
    for ( j =0 ; j < n[i] ; j++) {
      row  = idx[i][j];
      proc = rtable[row];
      w4[proc]++;                        /* count rows of IS i owned by proc */
    }
    for( j = 0; j < size; j++){
      if( w4[j] ) { w1[j] += w4[j]; w3[j] += 1;}
    }
  }

  nrqs     = 0;  /* no of outgoing messages */
  msz      = 0;  /* total mesg length (for all procs) */
  w1[rank] = 0;  /* no mesg sent to itself */
  w3[rank] = 0;
  for (i =0; i < size ; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;}    /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs +1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) list of destination ranks */
  for (i =0, j=0; i < size ; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message has a header = 1 + 2*(no of IS) in front of the data */
  for (i = 0; i<nrqs ; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine the max message size (bsz) and how
     many messages to expect (nrqr) */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1+size;
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz  = rw1[rank];                    /* largest message anyone sends me */
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];                    /* number of procs sending to me */
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers.  Prob none if nrqr = 0 ????
     (one contiguous slab of nrqr slots of bsz ints each) */
  rbuf    = (int**) PetscMalloc((nrqr+1) *sizeof(int*)); CHKPTRQ(rbuf);
  rbuf[0] = (int *) PetscMalloc((nrqr *bsz+1) * sizeof(int)); CHKPTRQ(rbuf[0]);
  for (i=1; i<nrqr ; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Now post the receives (before the sends, so no message is unexpected) */
  recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrqr; ++i){
    MPI_Irecv((void *)(rbuf[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Allocate memory for outgoing messages */
  outdat = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat);
  PetscMemzero(outdat, 2*size*sizeof(int*));
  tmp    = (int *)PetscMalloc((msz+1) *sizeof (int)); CHKPTRQ(tmp); /* mesg array (one slab for all outgoing msgs) */
  ptr    = outdat +size;   /* Pointers to the current write position in each outgoing buffer */
  ctr    = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr);

  {
    /* carve tmp into per-destination buffers of length w1[j] */
    int *iptr = tmp;
    int ict   = 0;
    for (i = 0; i < nrqs ; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers (zeroed; filled in as IS are parsed below) */
  for ( i=0 ; i<nrqs ; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1, 2 * w3[j]*sizeof(int));
    ptr[j]       = outdat[j] + 2*w3[j] +1;  /* data starts after the header */
  }

  /* Memory for doing the local proc's work: per-IS bit table over all m rows,
     and per-IS result arrays of up to m entries.
     NOTE(review): table slots are allocated with sizeof(int *) though table
     is char ** - same size on common platforms, but sizeof(char *) would be
     the precise type. */
  table    = (char **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(table);
  data     = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(data);
  table[0] = (char *)PetscMalloc((m/BITSPERBYTE +1)*(imax)); CHKPTRQ(table[0]);
  data [0] = (int *)PetscMalloc((m+1)*(imax)*sizeof(int)); CHKPTRQ(data[0]);

  for(i = 1; i<imax ; i++) {
    table[i] = table[0] + (m/BITSPERBYTE+1)*i;
    data[i]  = data[0] + (m+1)*i;
  }

  PetscMemzero((void*)*table,(m/BITSPERBYTE+1)*(imax));
  isz = (int *)PetscMalloc((imax+1) *sizeof(int)); CHKPTRQ(isz);
  PetscMemzero((void *)isz,(imax+1) *sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data */
  for ( i=0 ; i<imax ; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for( j=0; j<n[i]; j++) {  /* parse the indices of each IS */
      row  = idx[i][j];
      proc = rtable[row];
      if (proc != rank) { /* off-proc row: copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
      else { /* local row: update the bit table + result array */
        if ( !BT_LOOKUP(table[i],row)) { data[i][isz[i]++] = row;}
      }
    }
    /* Update the headers for the current IS */
    for( j = 0; j<size; j++) { /* Can optimise this loop too */
      if (ctr[j]) {
        k                = ++outdat[j][0];  /* one more IS in this message */
        outdat[j][2*k]   = ctr[j];          /* its data length */
        outdat[j][2*k-1] = i;               /* its IS number */
      }
    }
  }

  /* Now post the sends */
  send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for( i =0; i< nrqs; ++i){
    j = pa[i];
    MPI_Isend( (void *)(outdat[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  /* I no longer need the original indices */
  for( i=0; i< imax; ++i) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  PetscFree(idx);
  PetscFree(n);
  PetscFree(rtable);
  /* destroy the input IS; they are recreated (enlarged) at the end */
  for( i=0; i< imax; ++i) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }

  /* Do local work while communication is in flight */
  ierr = FindOverlapLocal(C, imax, table,isz, data); CHKERRQ(ierr);
  /* Extract the matrices */

  /* Receive messages */
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status);
    for ( i=0; i< nrqr; ++i) {
      MPI_Waitany(nrqr, recv_waits, &index, recv_status+i);
    }

    send_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(nrqs,send_waits,send_status);
  }
  /* Phase 1 sends are complete - deallocate buffers */
  PetscFree(outdat);
  PetscFree(w1);
  PetscFree(tmp);

  /* Process the received requests; xdata[i]/isz1[i] hold the reply for
     request i (in the same message format) */
  xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata);
  isz1  = (int *)PetscMalloc((nrqr+1) *sizeof(int)); CHKPTRQ(isz1);
  ierr  = FindOverlapRecievedMesg(C, nrqr, rbuf,xdata,isz1); CHKERRQ(ierr);

  /* No longer need rbuf. */
  PetscFree(rbuf[0]);
  PetscFree(rbuf);

  /* Send the data back */
  /* Do a global reduction to know the buffer space required for incoming
     messages (max reply size addressed to me) */
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero((void*)rw1,2*size*sizeof(int));
    rw2 = rw1+size;
    for (i =0; i < nrqr ; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];    /* size of my reply to 'proc' */
    }

    MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers */

  /* Allocate memory for recv buffers.  Prob none if nrqs = 0 ???? */
  rbuf2    = (int**) PetscMalloc((nrqs+1) *sizeof(int*)); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) PetscMalloc((nrqs*bsz1+1) * sizeof(int)); CHKPTRQ(rbuf2[0]);
  for (i=1; i<nrqs ; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Now post the receives */
  /* NOTE(review): CHKPTRQ(recv_waits2) appears twice here in the original
     source - harmless duplication, kept as-is. */
  recv_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(recv_waits2)
  CHKPTRQ(recv_waits2);
  for ( i=0; i<nrqs; ++i){
    MPI_Irecv((void *)(rbuf2[i]), bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i);
  }

  /* Now post the sends (replies go back to the requesting processors) */
  send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  for( i =0; i< nrqr; ++i){
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend( (void *)(xdata[i]), isz1[i], MPI_INT, j, tag, comm, send_waits2+i);
  }

  /* Receive work done on other processors and merge it into the local
     tables/result arrays */
  {
    int index, is_no, ct1, max;
    MPI_Status *send_status2 ,*recv_status2;

    recv_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status2);

    for ( i=0; i< nrqs; ++i) {
      MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i);
      /* Process the message: walk the header, then the per-IS data */
      ct1 = 2*rbuf2[index][0]+1;
      for (j=1; j<=rbuf2[index][0]; j++) {
        max   = rbuf2[index][2*j];
        is_no = rbuf2[index][2*j-1];
        for (k=0; k < max ; k++, ct1++) {
          row = rbuf2[index][ct1];
          if(!BT_LOOKUP(table[is_no],row)) { data[is_no][isz[is_no]++] = row;}
        }
      }
    }

    send_status2 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status2);
    MPI_Waitall(nrqr,send_waits2,send_status2);

    PetscFree(send_status2); PetscFree(recv_status2);
  }

  /* Recreate the (now enlarged) index sets */
  for ( i=0; i<imax; ++i) {
    ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }
  TrSpace( &space, &fr, &maxs );
  /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs);*/

  PetscFree(ctr);
  PetscFree(pa);
  PetscFree(rbuf2[0]);
  PetscFree(rbuf2);
  PetscFree(send_waits);
  PetscFree(recv_waits);
  PetscFree(send_waits2);
  PetscFree(recv_waits2);
  PetscFree(table[0]);
  PetscFree(table);
  PetscFree(send_status);
  PetscFree(recv_status);
  PetscFree(isz1);
  PetscFree(xdata[0]);
  PetscFree(xdata);
  PetscFree(isz);
  PetscFree(data[0]);
  PetscFree(data);

  return 0;
}

/*  FindOverlapLocal() - Called by MatIncreaseOverlap, to do the work on
    the local processor: for every row already in an index set, add the
    columns of that row (from both the diagonal block A and the
    off-diagonal block B) to the set.

    Inputs:
     C      - Mat_MPIAIJ;
     imax   - total no of index sets processed at a time;
     table  - an array of imax bit tables, each m bits (marks membership);

    Input/Output:
     isz    - array containing the count of the solution elements
              corresponding to each index set (grown in place);
     data   - pointer to the solutions (grown in place)
*/
static int FindOverlapLocal(Mat C, int imax, char **table, int *isz,int **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
  Mat        A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  int        start, end, val, max, rstart,cstart, ashift, bshift,*ai, *aj;
  int        *bi, *bj, *garray, i, j, k, row;

  rstart = c->rstart;         /* first local row (global numbering) */
  cstart = c->cstart;         /* first local column (global numbering) */
  ashift = a->indexshift;     /* 0- vs 1-based indexing of the SeqAIJ data */
  ai     = a->i;
  aj     = a->j +ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j +bshift;
  garray = c->garray;         /* local-to-global map for B's columns */

  for( i=0; i<imax; i++) {
    /* iterate only over the rows present BEFORE this call (j < max);
       newly-added columns are appended past max and not expanded here */
    for ( j=0, max =isz[i] ; j< max; j++) {
      row   = data[i][j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for ( k=start; k < end; k++) { /* Amat: columns are local, shift to global */
        val = aj[k] + ashift + cstart;
        if(!BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for ( k=start; k < end; k++) { /* Bmat: map column through garray to global */
        val = garray[bj[k]+bshift] ;
        if(!BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;}
      }
    }
  }

  return 0;
}

/*  FindOverlapRecievedMesg - Process the received messages, and return
    the output: for each requested row, the set of (deduplicated) column
    indices, packed in the same message format as the request.

    Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

    Output:
      xdata - array of messages to be sent back (xdata[0] is the single
              allocation backing all of them; it is grown by reallocation
              when the initial size estimate is exceeded)
      isz1  - size of each message
*/
static int FindOverlapRecievedMesg(Mat C, int nrqr, int ** rbuf, int ** xdata, int * isz1 )
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
  Mat        A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  int        rstart,cstart, ashift, bshift,*ai, *aj, *bi, *bj, *garray, i, j, k;
  int        row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end;
  int        val, max1, max2, rank, m, no_malloc =0, *tmp, new_estimate, ctr;
  char       *xtable;

  rank   = c->rank;
  m      = c->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j +ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j +bshift;
  garray = c->garray;

  /* ct = total no of IS requested; total_sz = total no of rows requested */
  for (i =0, ct =0, total_sz =0; i< nrqr; ++i){
    ct += rbuf[i][0];
    for ( j = 1; j <= rbuf[i][0] ; j++ ) { total_sz += rbuf[i][2*j]; }
  }

  /* initial output-size estimate: avg nonzeros/row times no of IS, vs the
     request size itself; tripled for headroom.  Grown on demand below. */
  max1         = ct*(a->nz +b->nz)/c->m;
  mem_estimate = 3*((total_sz > max1?total_sz:max1)+1);
  xdata[0]     = (int *)PetscMalloc(mem_estimate *sizeof(int)); CHKPTRQ(xdata[0]);
  ++no_malloc;
  xtable       = (char *)PetscMalloc((m/BITSPERBYTE+1)); CHKPTRQ(xtable);
  PetscMemzero((void *)isz1,(nrqr+1) *sizeof(int));

  ct3 = 0;  /* total ints written into xdata[0] so far (across all messages) */
  for (i =0; i< nrqr; i++) { /* for each mesg from proc i */
    ct1  = 2*rbuf[i][0]+1;   /* read cursor into rbuf[i] (past its header) */
    ct2  = ct1;              /* write cursor into xdata[i] (past its header) */
    ct3 += ct1;              /* account for the header space of this reply */
    for (j = 1, max1= rbuf[i][0]; j<=max1; j++) { /* for each IS from proc i */
      PetscMemzero((void *)xtable,(m/BITSPERBYTE+1)); /* fresh dedup table per IS */
      oct2 = ct2;            /* start of this IS's data in xdata[i] */
      /* first copy the requested rows themselves (deduplicated) */
      for (k =0; k < rbuf[i][2*j]; k++, ct1++) {
        row = rbuf[i][ct1];
        if(!BT_LOOKUP(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow xdata[0] by 1.5x and re-fix the per-message pointers */
            new_estimate = (int)(1.5*mem_estimate)+1;
            tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp);
            PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
            PetscFree(xdata[0]);
            xdata[0] = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;ct3++;
        }
      }
      /* then expand every row copied above by its column adjacency */
      for ( k=oct2, max2 =ct2 ; k< max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for ( l=start; l < end; l++) { /* Amat columns (shift to global) */
          val = aj[l] +ashift + cstart;
          if(!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* grow xdata[0] by 1.5x and re-fix the per-message pointers */
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0] = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for ( l=start; l < end; l++) { /* Bmat columns (map through garray) */
          val = garray[bj[l]+bshift] ;
          if(!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* grow xdata[0] by 1.5x and re-fix the per-message pointers */
              new_estimate = (int)(1.5*mem_estimate)+1;
              tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0] = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2-oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf[i][2*j-1];
    }
    xdata[i][0] = rbuf[i][0];
    xdata[i+1]  = xdata[i] +ct2;  /* next reply starts where this one ends */
    isz1[i]     = ct2;            /* size of each message */
  }

  PetscFree(xtable);
  PLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc);
  return 0;
}

/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices,
   submat[i] = C(isrow[i], iscol[i]), from the parallel matrix C.

   Off-processor rows are fetched with a four-round message exchange:
     tag   : row requests (same header format as MatIncreaseOverlap)
     tag+1 : per-row lengths of the replies
     tag+2 : column indices of the requested rows
     tag+3 : numerical values of the requested rows

   Input:
     C     - the matrix
     ismax - number of submatrices
     isrow, iscol - the row/column index sets (iscol must be sorted)
     scall - MAT_REUSE_MATRIX to reuse existing *submat of matching sizes,
             otherwise new matrices are created
   Output:
     submat - array of ismax sequential AIJ matrices
*/
int MatGetSubMatrices_MPIAIJ (Mat C,int ismax, IS *isrow,IS *iscol,MatGetSubMatrixCall scall, Mat **submat)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  Mat         A = c->A;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data, *mat;
  int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable, start, end, size;
  int         **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2, ierr, **rbuf1, row, proc;
  int         nrqs, msz, **ptr, index, *req_size, *ctr, *pa, tag, *tmp,tcol,bsz, nrqr;
  int         **rbuf3,*req_source,**sbuf_aj, ashift, **rbuf2, max1,max2, **rmap;
  int         **cmap,**lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2;
  MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2, *recv_waits3 ;
  MPI_Request *recv_waits4,*send_waits3,*send_waits4;
  MPI_Status  *recv_status ,*recv_status2,*send_status,*send_status3 ,*send_status2;
  MPI_Status  *recv_status3,*recv_status4,*send_status4;
  MPI_Comm    comm;
  Scalar      **rbuf4, **sbuf_aa, *vals, *mat_a;

  comm   = C->comm;
  tag    = C->tag;
  size   = c->size;
  rank   = c->rank;
  m      = c->M;
  ashift = a->indexshift;

  /* Check if the col indices are sorted (required for the cmap lookup) */
  for (i=0; i<ismax; i++) {
    if (ISSorted(iscol[i],&j),!j) SETERRQ(1,"MatGetSubmatrices_MPIAIJ: col IS is not sorted");
  }

  irow   = (int **)PetscMalloc((ismax+1)*sizeof(int *)); CHKPTRQ(irow);
  icol   = (int **)PetscMalloc((ismax+1)*sizeof(int *)); CHKPTRQ(icol);
  nrow   = (int *) PetscMalloc((ismax+1)*sizeof(int )); CHKPTRQ(nrow);
  ncol   = (int *) PetscMalloc((ismax+1)*sizeof(int )); CHKPTRQ(ncol);
  rtable = (int *) PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable);
  /* Hash table for mapping row -> proc */

  for ( i=0 ; i<ismax ; i++) { /* Extract the indices and their sizes */
    ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&nrow[i]); CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> owning proc */
  for( i=0, j=0; i< size; i++) {
    for (; j <c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }

  /* Evaluate communication - mesg to whom, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them. */
  w1 = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;   /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;   /* no of IS that need to be sent to proc i */
  w4 = w3 + size;   /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise w1, w2, w3 (contiguous) */
  for ( i=0; i<ismax ; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector */
    for ( j =0 ; j < nrow[i] ; j++) {
      row  = irow[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for( j = 0; j < size; j++){
      if( w4[j] ) { w1[j] += w4[j]; w3[j] += 1;}
    }
  }

  nrqs     = 0;  /* no of outgoing messages */
  msz      = 0;  /* total mesg length (for all procs) */
  w1[rank] = 0;  /* no mesg sent to itself */
  w3[rank] = 0;
  for (i =0; i < size ; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;}  /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs +1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) */
  for (i =0, j=0; i < size ; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message has a header = 1 + 2*(no of IS) in front of the data */
  for (i = 0; i<nrqs ; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine the max request size (bsz) and how
     many requests to expect (nrqr) */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1+size;
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz  = rw1[rank];
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers.  Prob none if nrqr = 0 ???? */
  rbuf1    = (int**) PetscMalloc((nrqr+1) *sizeof(int*)); CHKPTRQ(rbuf1);
  rbuf1[0] = (int *) PetscMalloc((nrqr *bsz+1) * sizeof(int)); CHKPTRQ(rbuf1[0]);
  for (i=1; i<nrqr ; ++i) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrqr; ++i){
    MPI_Irecv((void *)(rbuf1[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Allocate memory for outgoing messages */
  sbuf1 = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(sbuf1);
  PetscMemzero(sbuf1, 2*size*sizeof(int*));
  /* allocate memory for outgoing data + buf to receive the first reply
     (the second msz half of tmp backs rbuf2 below) */
  tmp   = (int *)PetscMalloc((2*msz+1) *sizeof (int)); CHKPTRQ(tmp); /* mesg array */
  ptr   = sbuf1 +size;  /* Pointers to the data in outgoing buffers */
  ctr   = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr);

  {
    /* carve tmp into per-destination buffers of length w1[j] */
    int *iptr = tmp;
    int ict   = 0;
    for (i = 0; i < nrqs ; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for ( i=0 ; i<nrqs ; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscMemzero(sbuf1[j]+1, 2 * w3[j]*sizeof(int));
    ptr[j]      = sbuf1[j] + 2*w3[j] +1;
  }

  /* Parse the isrow and copy off-proc row requests into outbuf */
  for ( i=0 ; i<ismax ; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for( j=0; j<nrow[i]; j++) { /* parse the indices of each IS */
      row  = irow[i][j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for( j = 0; j<size; j++) { /* Can optimise this loop too */
      if (ctr[j]) {
        k               = ++sbuf1[j][0];
        sbuf1[j][2*k]   = ctr[j];
        sbuf1[j][2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for( i =0; i< nrqs; ++i){
    j = pa[i];
    /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */
    MPI_Isend( (void *)(sbuf1[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  /* Post receives to capture the reply sizes (row lengths), tag+1.
     rbuf2[i] mirrors the layout of sbuf1[pa[i]] (header + one length per
     requested row), backed by the upper half of tmp. */
  recv_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits2);
  rbuf2 = (int**)PetscMalloc((nrqs+1) *sizeof(int *)); CHKPTRQ(rbuf2);
  rbuf2[0] = tmp + msz;
  for( i =1; i< nrqs; ++i){
    j = pa[i];
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for( i =0; i< nrqs; ++i){
    j = pa[i];
    MPI_Irecv( (void *)(rbuf2[i]), w1[j], MPI_INT, j, tag+1, comm, recv_waits2+i);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive the row requests; for each, compute the row lengths and send
     them back (tag+1) so the requester can size its data buffers */
  send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) );
  CHKPTRQ(recv_status);
  req_size   = (int *) PetscMalloc( (nrqr +1) * sizeof(int)) ; CHKPTRQ(req_size);
  req_source = (int *) PetscMalloc( (nrqr +1) * sizeof(int)) ; CHKPTRQ(req_source);
  sbuf2      = (int**) PetscMalloc( (nrqr +1) * sizeof(int*)) ; CHKPTRQ(sbuf2);

  for ( i=0; i< nrqr; ++i) {
    MPI_Waitany(nrqr, recv_waits, &index, recv_status+i);
    /* req_size[index] = 2*rbuf1[index][0];*/
    req_size[index] = 0;
    start = 2*rbuf1[index][0] + 1 ;
    MPI_Get_count(recv_status+i,MPI_INT, &end);
    sbuf2[index] = (int *)PetscMalloc(end*sizeof(int)); CHKPTRQ(sbuf2[index]);
    for (j=start; j< end; j++) {
      /* NOTE(review): MatGetRow here has no matching MatRestoreRow; only
         ncols is queried (no col/val pointers) - verify this is safe for
         this Mat implementation. */
      ierr = MatGetRow(C,rbuf1[index][j], &ncols,0,0); CHKERRQ(ierr);
      sbuf2[index][j]  = ncols;         /* length of this requested row */
      req_size[index] += ncols;         /* total data we will ship back */
    }
    req_source[index] = recv_status[i].MPI_SOURCE;
    /* form the header: total size first, then copy the request's header */
    sbuf2[index][0] = req_size[index];
    for (j=1; j<start; j++){ sbuf2[index][j] = rbuf1[index][j]; }
    MPI_Isend((void *)(sbuf2[index]),end,MPI_INT,req_source[index],tag+1, comm, send_waits2+i);
  }

  /* recv buffer sizes */
  /* As each size message (tag+1) arrives, allocate the matching data
     buffers and post receives for the column indices (tag+2) and the
     values (tag+3) */
  rbuf3 = (int**)PetscMalloc((nrqs+1) *sizeof(int*)); CHKPTRQ(rbuf3);
  rbuf4 = (Scalar**)PetscMalloc((nrqs+1) *sizeof(Scalar*)); CHKPTRQ(rbuf4);
  recv_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits3);
  recv_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits4);
  recv_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) );
  CHKPTRQ(recv_status2);
  for ( i=0; i< nrqs; ++i) {
    MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i);

    rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int));
    CHKPTRQ(rbuf3[index]);
    rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*sizeof(Scalar));
    CHKPTRQ(rbuf4[index]);
    MPI_Irecv((void *)(rbuf3[index]),rbuf2[index][0], MPI_INT,
              recv_status2[i].MPI_SOURCE, tag+2, comm, recv_waits3+index);
    MPI_Irecv((void *)(rbuf4[index]),rbuf2[index][0], MPIU_SCALAR,
              recv_status2[i].MPI_SOURCE, tag+3, comm, recv_waits4+index);
  }

  /* Wait on sends1 and sends2 */
  send_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) );
  CHKPTRQ(send_status);
  send_status2 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) );
  CHKPTRQ(send_status2);

  MPI_Waitall(nrqs,send_waits,send_status);
  MPI_Waitall(nrqr,send_waits2,send_status2);

  /* Now allocate buffers for the column indices (a->j), and send them off */
  sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(sbuf_aj);
  for ( i=0, j =0; i< nrqr; i++) j += req_size[i];
  sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]);
  for (i =1; i< nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  send_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits3);
  for (i=0; i<nrqr; i++) {
    ct1 = 2*rbuf1[i][0]+1;
    ct2 = 0;
    for (j=1, max1 = rbuf1[i][0]; j<= max1; j++){
      for( k=0; k< rbuf1[i][2*j]; k++, ct1++) {
        row = rbuf1[i][ct1];
        ierr = MatGetRow(C, row, &ncols, &cols, 0); CHKERRQ(ierr);
        PetscMemcpy(sbuf_aj[i]+ct2, cols, ncols*sizeof(int));
        ct2 += ncols;
        ierr = MatRestoreRow(C,row, &ncols, &cols,0); CHKERRQ(ierr);
      }
    }
    /* no header for this message (sizes already known from tag+1) */
    MPI_Isend((void *)(sbuf_aj[i]),req_size[i],MPI_INT,req_source[i],tag+2, comm, send_waits3+i);
  }
  recv_status3 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) );
  CHKPTRQ(recv_status3);
  send_status3 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) );
  CHKPTRQ(send_status3);

  /* Now allocate buffers for the values (a->a), and send them off */
  sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa);
  for ( i=0, j =0; i< nrqr; i++) j += req_size[i];
  sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]);
  for (i =1; i< nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  send_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits4);
  for (i=0; i<nrqr; i++) {
    ct1 = 2*rbuf1[i][0]+1;
    ct2 = 0;
    for (j=1, max1 = rbuf1[i][0]; j<= max1; j++){
      for( k=0; k< rbuf1[i][2*j]; k++, ct1++) {
        row = rbuf1[i][ct1];
        ierr = MatGetRow(C, row, &ncols, 0, &vals); CHKERRQ(ierr);
        PetscMemcpy(sbuf_aa[i]+ct2, vals, ncols*sizeof(Scalar));
        ct2 += ncols;
        ierr = MatRestoreRow(C,row, &ncols,0,&vals); CHKERRQ(ierr);
      }
    }
    /* no header for this message */
    MPI_Isend((void *)(sbuf_aa[i]),req_size[i],MPIU_SCALAR,req_source[i],tag+3, comm, send_waits4+i);
  }
  recv_status4 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) );
  CHKPTRQ(recv_status4);
  send_status4 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) );
  CHKPTRQ(send_status4);

  /* Form the matrix */
  /* create col map: cmap[i][globalcol] = (position in iscol[i]) + 1,
     or 0 if the column is not in iscol[i] */
  cmap    = (int **) PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(cmap);
  cmap[0] = (int *)PetscMalloc((1+ ismax*c->N)*sizeof(int)); CHKPTRQ(cmap[0]);
  PetscMemzero((char *)cmap[0],(1+ ismax*c->N)*sizeof(int));
  for (i =1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->N; }
  for (i=0; i< ismax; i++) {
    for ( j=0; j< ncol[i]; j++) {
      cmap[i][icol[i][j]] = j+1;
    }
  }

  /* Create lens which is required for MatCreate... (nonzeros per submat row) */
  lens = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(lens);
  for (i =0, j=0; i<ismax; i++) { j +=nrow[i]; }
  lens[0] = (int *)PetscMalloc((1+ j)*sizeof(int)); CHKPTRQ(lens[0]);
  PetscMemzero((char *)lens[0], (1+ j)*sizeof(int));
  for (i =1; i<ismax; i++) { lens[i] = lens[i-1] +nrow[i-1]; }

  /* Update lens from local data */
  for (i=0; i< ismax; i++) {
    for (j =0; j< nrow[i]; j++) {
      row  = irow[i][j] ;
      proc = rtable[row];
      if (proc == rank) {
        ierr = MatGetRow(C,row,&ncols,&cols,0); CHKERRQ(ierr);
        for (k =0; k< ncols; k++) {
          if ( cmap[i][cols[k]]) { lens[i][j]++ ;}
        }
        ierr = MatRestoreRow(C,row,&ncols,&cols,0); CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: rmap[i][globalrow] = position of that row in isrow[i] */
  rmap    = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(rmap);
  rmap[0] = (int *)PetscMalloc((1+ ismax*c->M)*sizeof(int)); CHKPTRQ(rmap[0]);
  PetscMemzero((char *)rmap[0],(1+ ismax*c->M)*sizeof(int));
  for (i =1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->M ;}
  for (i=0; i< ismax; i++) {
    for ( j=0; j< nrow[i]; j++) {
      rmap[i][irow[i][j]] = j;
    }
  }

  /* Update lens from off-proc data (as the column-index messages arrive) */
  for ( tmp2 =0; tmp2 < nrqs; tmp2++) {
    MPI_Waitany(nrqs, recv_waits3, &i, recv_status3+tmp2);
    index = pa[i];
    ct1 = 2*sbuf1[index][0]+1;  /* cursor into sbuf1/rbuf2 (requested rows) */
    ct2 = 0;                    /* cursor into rbuf3/rbuf4 (received data) */
    for (j =1; j<= sbuf1[index][0]; j++) {
      is_no = sbuf1[index][2*j-1];
      max1  = sbuf1[index][2*j];
      for (k =0; k< max1; k++, ct1++) {
        row  = sbuf1[index][ct1];
        row  = rmap[is_no][row];  /* the row in the new matrix to be */
        max2 = rbuf2[i][ct1];     /* length of this row's reply */
        for (l=0; l<max2; l++, ct2++) {
          if (cmap[is_no][rbuf3[i][ct2]]) {
            lens[is_no][row]++;
          }
        }
      }
    }
  }
  MPI_Waitall(nrqr,send_waits3,send_status3);

  /* Create the submatrices (or check the sizes of reused ones) */
  if( scall == MAT_REUSE_MATRIX) {
    int n_cols, n_rows;
    for (i=0; i<ismax; i++){
      ierr = MatGetSize((*submat)[i],&n_rows, &n_cols); CHKERRQ(ierr);
      if ((n_rows !=nrow[i]) || (n_cols !=ncol[i])) {
        SETERRQ(1,"MatGetSubmatrices_MPIAIJ:");
      }
    }
  }
  else {
    *submat = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(*submat);
    for ( i=0; i<ismax; i++) {
      ierr = MatCreateSeqAIJ(comm, nrow[i],ncol[i],0,lens[i],(*submat)+i); CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows (writing directly into the SeqAIJ arrays) */
  for (i=0; i< ismax; i++) {
    mat = (Mat_SeqAIJ *)((*submat)[i]->data);
    for (j =0; j< nrow[i]; j++) {
      row  = irow[i][j] ;
      proc = rtable[row];
      if (proc == rank) {
        ierr  = MatGetRow(C,row,&ncols,&cols,&vals); CHKERRQ(ierr);
        /* NOTE(review): 'row' is overwritten with the remapped local index
           here, so the MatRestoreRow below receives a different row number
           than the MatGetRow above - verify against the Mat implementation
           (a separate variable would make the pairing exact). */
        row   = rmap[i][row];
        mat_i = mat->i[row] + ashift;
        mat_a = mat->a + mat_i;
        mat_j = mat->j + mat_i;
        for (k =0; k< ncols; k++) {
          if ((tcol = cmap[i][cols[k]])) {
            /* cmap stores position+1; subtract 1 only for 0-based storage */
            *mat_j++ = tcol - (!ashift);
            *mat_a++ = vals[k];
            mat->ilen[row]++;
          }
        }
        ierr = MatRestoreRow(C,row,&ncols,&cols,&vals); CHKERRQ(ierr);
      }
    }
  }

  /* Now assemble the off-proc rows (as the value messages arrive) */
  for(tmp2 =0; tmp2 <nrqs; tmp2++) {
    MPI_Waitany(nrqs, recv_waits4, &i, recv_status4+tmp2);
    index = pa[i];
    ct1 = 2*sbuf1[index][0]+1;  /* cursor into sbuf1/rbuf2 */
    ct2 = 0;                    /* cursor into rbuf3/rbuf4 */
    for (j =1; j<= sbuf1[index][0]; j++) {
      is_no = sbuf1[index][2*j-1];
      mat   = (Mat_SeqAIJ *)((*submat)[is_no]->data);
      max1  = sbuf1[index][2*j];
      for (k =0; k< max1; k++, ct1++) {
        row   = sbuf1[index][ct1];
        row   = rmap[is_no][row];  /* the row in the new matrix to be */
        mat_i = mat->i[row] + ashift;
        mat_a = mat->a + mat_i;
        mat_j = mat->j + mat_i;
        max2  = rbuf2[i][ct1];
        for (l=0; l<max2; l++, ct2++) {
          if ((tcol = cmap[is_no][rbuf3[i][ct2]])) {
            *mat_j++ = tcol - (! ashift);
            *mat_a++ = rbuf4[i][ct2];
            mat->ilen[row]++;
          }
        }
      }
    }
  }
  MPI_Waitall(nrqr,send_waits4,send_status4);

  /* Pack up */
  for( i=0; i< ismax; i++) {
    ierr = MatAssemblyBegin((*submat)[i], FINAL_ASSEMBLY); CHKERRQ(ierr);
  }
  for( i=0; i< ismax; i++) {
    ierr = MatAssemblyEnd((*submat)[i], FINAL_ASSEMBLY); CHKERRQ(ierr);
  }
  /* Restore the indices */
  for (i=0; i<ismax; i++) {
    ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr);
  }
  /* Destroy allocated memory */
  PetscFree(nrow);
  PetscFree(ncol);
  PetscFree(irow);
  PetscFree(icol);
  PetscFree(rtable);
  PetscFree(w1);
  PetscFree(pa);
  PetscFree(rbuf1[0]);
  PetscFree(rbuf1);
  PetscFree(sbuf1 );
  PetscFree(tmp);
  PetscFree(ctr);
  PetscFree(rbuf2);
  PetscFree(req_size);
  PetscFree(req_source);
  for ( i=0; i< nrqr; ++i) {
    PetscFree(sbuf2[i]);
  }
  for ( i=0; i< nrqs; ++i) {
    PetscFree(rbuf3[i]);
    PetscFree(rbuf4[i]);
  }

  PetscFree( sbuf2 );
  PetscFree(rbuf3);
  PetscFree(rbuf4 );
  PetscFree(sbuf_aj[0]);
  PetscFree(sbuf_aj);
  PetscFree(sbuf_aa[0]);
  PetscFree(sbuf_aa);

  PetscFree(cmap[0]);
  PetscFree(rmap[0]);
  PetscFree(cmap);
  PetscFree(rmap);
  PetscFree(lens[0]);
  PetscFree(lens);

  PetscFree(recv_waits );
  PetscFree(recv_waits2);
  PetscFree(recv_waits3);
  PetscFree(recv_waits4);

  PetscFree(recv_status);
  PetscFree(recv_status2);
  PetscFree(recv_status3);
  PetscFree(recv_status4);

  PetscFree(send_waits);
  PetscFree(send_waits2);
  PetscFree(send_waits3);
  PetscFree(send_waits4);

  PetscFree( send_status);
  PetscFree(send_status2);
  PetscFree(send_status3);
  PetscFree(send_status4);

  return 0;
}