1 #ifndef lint 2 static char vcid[] = "$Id: mpiov.c,v 1.23 1996/02/15 16:53:52 balay Exp balay $"; 3 #endif 4 5 #include "mpiaij.h" 6 #include "inline/bitarray.h" 7 8 static int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *); 9 static int FindOverlapLocal(Mat , int , char **,int*, int**); 10 static int FindOverlapRecievedMesg(Mat , int, int **, int**, int* ); 11 12 int MatIncreaseOverlap_MPIAIJ(Mat C, int imax, IS *is, int ov) 13 { 14 int i, ierr; 15 if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIAIJ: negative overlap specified\n");} 16 for (i =0; i<ov; ++i) { 17 ierr = MatIncreaseOverlap_MPIAIJ_private(C, imax, is); CHKERRQ(ierr); 18 } 19 return 0; 20 } 21 22 /* 23 Sample message format: 24 If a processor A wants processor B to process some elements corresponding 25 to index sets 1s[1], is[5] 26 mesg [0] = 2 ( no of index sets in the mesg) 27 ----------- 28 mesg [1] = 1 => is[1] 29 mesg [2] = sizeof(is[1]); 30 ----------- 31 mesg [5] = 5 => is[5] 32 mesg [6] = sizeof(is[5]); 33 ----------- 34 mesg [7] 35 mesg [n] datas[1] 36 ----------- 37 mesg[n+1] 38 mesg[m] data(is[5]) 39 ----------- 40 41 Notes: 42 nrqs - no of requests sent (or to be sent out) 43 nrqr - no of requests recieved (which have to be or which have been processed 44 */ 45 static int MatIncreaseOverlap_MPIAIJ_private(Mat C, int imax, IS *is) 46 { 47 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 48 int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data; 49 int size, rank, m,i,j,k, ierr, **rbuf, row, proc, nrqs, msz, **outdat, **ptr; 50 int *ctr, *pa, tag, *tmp,bsz, nrqr , *isz, *isz1, **xdata; 51 int bsz1, **rbuf2; 52 char **table; 53 MPI_Comm comm; 54 MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2 ; 55 MPI_Status *send_status ,*recv_status; 56 double space, fr, maxs; 57 58 comm = C->comm; 59 tag = C->tag; 60 size = c->size; 61 rank = c->rank; 62 m = c->M; 63 64 65 TrSpace( &space, &fr, &maxs ); 66 /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */ 67 68 idx = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(idx); 69 n = (int *)PetscMalloc((imax+1)*sizeof(int )); CHKPTRQ(n); 70 rtable = (int *)PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable); 71 /* Hash table for maping row ->proc */ 72 73 for ( i=0 ; i<imax ; i++) { 74 ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr); 75 ierr = ISGetLocalSize(is[i],&n[i]); CHKERRQ(ierr); 76 } 77 78 /* Create hash table for the mapping :row -> proc*/ 79 for( i=0, j=0; i< size; i++) { 80 for (; j <c->rowners[i+1]; j++) { 81 rtable[j] = i; 82 } 83 } 84 85 /* evaluate communication - mesg to who, length of mesg, and buffer space 86 required. Based on this, buffers are allocated, and data copied into them*/ 87 w1 = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /* mesg size */ 88 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 89 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 90 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 91 PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector*/ 92 for ( i=0; i<imax ; i++) { 93 PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector*/ 94 for ( j =0 ; j < n[i] ; j++) { 95 row = idx[i][j]; 96 proc = rtable[row]; 97 w4[proc]++; 98 } 99 for( j = 0; j < size; j++){ 100 if( w4[j] ) { w1[j] += w4[j]; w3[j] += 1;} 101 } 102 } 103 104 nrqs = 0; /* no of outgoing messages */ 105 msz = 0; /* total mesg length (for all proc */ 106 w1[rank] = 0; /* no mesg sent to intself */ 107 w3[rank] = 0; 108 for (i =0; i < size ; i++) { 109 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 110 } 111 pa = (int *)PetscMalloc((nrqs +1)*sizeof(int)); CHKPTRQ(pa); /* (proc -array) */ 112 for (i =0, j=0; i < size ; i++) { 113 if (w1[i]) { pa[j] = i; j++; } 114 } 115 116 /* Each message would have a header = 1 + 2*(no of IS) + data */ 117 for (i = 0; i<nrqs ; i++) { 118 j = pa[i]; 119 w1[j] += w2[j] + 2* w3[j]; 120 msz += w1[j]; 121 } 122 123 124 /* Do a global reduction to determine how many messages to expect*/ 125 { 126 int *rw1, *rw2; 127 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 128 rw2 = rw1+size; 129 MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm); 130 bsz = rw1[rank]; 131 MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm); 132 nrqr = rw2[rank]; 133 PetscFree(rw1); 134 } 135 136 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 137 rbuf = (int**) PetscMalloc((nrqr+1) *sizeof(int*)); CHKPTRQ(rbuf); 138 rbuf[0] = (int *) PetscMalloc((nrqr *bsz+1) * sizeof(int)); CHKPTRQ(rbuf[0]); 139 for (i=1; i<nrqr ; ++i) rbuf[i] = rbuf[i-1] + bsz; 140 141 /* Now post the receives */ 142 recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 143 CHKPTRQ(recv_waits); 144 for ( i=0; i<nrqr; ++i){ 145 MPI_Irecv((void *)(rbuf[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i); 146 } 147 148 /* Allocate Memory for outgoing messages */ 149 outdat = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat); 150 PetscMemzero(outdat, 2*size*sizeof(int*)); 151 tmp = (int *)PetscMalloc((msz+1) *sizeof (int)); CHKPTRQ(tmp); /*mrsg arr */ 152 ptr = outdat +size; /* Pointers to the data in outgoing buffers */ 153 ctr = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr); 154 155 { 156 int *iptr = tmp; 157 int ict = 0; 158 for (i = 0; i < nrqs ; i++) { 159 j = pa[i]; 160 iptr += ict; 161 outdat[j] = iptr; 162 ict = w1[j]; 163 } 164 } 165 166 /* Form the outgoing messages */ 167 /*plug in the headers*/ 168 for ( i=0 ; i<nrqs ; i++) { 169 j = pa[i]; 170 outdat[j][0] = 0; 171 PetscMemzero(outdat[j]+1, 2 * w3[j]*sizeof(int)); 172 ptr[j] = outdat[j] + 2*w3[j] +1; 173 } 174 175 /* Memory for doing local proc's work*/ 176 table = (char **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(table); 177 data = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(data); 178 table[0] = (char *)PetscMalloc((m/BITSPERBYTE +1)*(imax)); CHKPTRQ(table[0]); 179 data [0] = (int *)PetscMalloc((m+1)*(imax)*sizeof(int)); CHKPTRQ(data[0]); 180 181 for(i = 1; i<imax ; i++) { 182 table[i] = table[0] + (m/BITSPERBYTE+1)*i; 183 data[i] = data[0] + (m+1)*i; 184 } 185 186 PetscMemzero((void*)*table,(m/BITSPERBYTE+1)*(imax)); 187 isz = (int *)PetscMalloc((imax+1) *sizeof(int)); CHKPTRQ(isz); 188 PetscMemzero((void *)isz,(imax+1) *sizeof(int)); 189 190 /* Parse the IS and update local tables and the outgoing buf with the data*/ 191 for ( i=0 ; i<imax ; i++) { 192 PetscMemzero(ctr,size*sizeof(int)); 193 for( j=0; j<n[i]; j++) { /* parse the indices of each IS */ 194 row = idx[i][j]; 195 proc = rtable[row]; 196 if (proc != rank) { /* copy to the outgoing buf*/ 197 ctr[proc]++; 198 *ptr[proc] = row; 199 ptr[proc]++; 200 } 201 else { /* Update the table */ 202 if ( !BT_LOOKUP(table[i],row)) { data[i][isz[i]++] = row;} 203 } 204 } 205 /* Update the headers for the current IS */ 206 for( j = 0; j<size; j++) { /* Can Optimise this loop too */ 207 if (ctr[j]) { 208 k= ++outdat[j][0]; 209 outdat[j][2*k] = ctr[j]; 210 outdat[j][2*k-1] = i; 211 } 212 } 213 } 214 215 216 217 /* Now post the sends */ 218 send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 219 CHKPTRQ(send_waits); 220 for( i =0; i< nrqs; ++i){ 221 j = pa[i]; 222 MPI_Isend( (void *)(outdat[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i); 223 } 224 225 /* I nolonger need the original indices*/ 226 for( i=0; i< imax; ++i) { 227 ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr); 228 } 229 PetscFree(idx); 230 PetscFree(n); 231 PetscFree(rtable); 232 for( i=0; i< imax; ++i) { 233 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 234 } 235 236 /* Do Local work*/ 237 ierr = FindOverlapLocal(C, imax, table,isz, data); CHKERRQ(ierr); 238 /* Extract the matrices */ 239 240 /* Receive messages*/ 241 { 242 int index; 243 244 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 245 CHKPTRQ(recv_status); 246 for ( i=0; i< nrqr; ++i) { 247 MPI_Waitany(nrqr, recv_waits, &index, recv_status+i); 248 } 249 250 send_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 251 CHKPTRQ(send_status); 252 MPI_Waitall(nrqs,send_waits,send_status); 253 } 254 /* Pahse 1 sends are complete - deallocate buffers */ 255 PetscFree(outdat); 256 PetscFree(w1); 257 PetscFree(tmp); 258 259 /* int FindOverlapRecievedMesg(Mat C, int imax, int *isz, char **table, int **data)*/ 260 xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata); 261 isz1 = (int *)PetscMalloc((nrqr+1) *sizeof(int)); CHKPTRQ(isz1); 262 ierr = FindOverlapRecievedMesg(C, nrqr, rbuf,xdata,isz1); CHKERRQ(ierr); 263 264 /* Nolonger need rbuf. */ 265 PetscFree(rbuf[0]); 266 PetscFree(rbuf); 267 268 269 /* Send the data back*/ 270 /* Do a global reduction to know the buffer space req for incoming messages*/ 271 { 272 int *rw1, *rw2; 273 274 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 275 PetscMemzero((void*)rw1,2*size*sizeof(int)); 276 rw2 = rw1+size; 277 for (i =0; i < nrqr ; ++i) { 278 proc = recv_status[i].MPI_SOURCE; 279 rw1[proc] = isz1[i]; 280 } 281 282 MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm); 283 bsz1 = rw2[rank]; 284 PetscFree(rw1); 285 } 286 287 /* Allocate buffers*/ 288 289 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 290 rbuf2 = (int**) PetscMalloc((nrqs+1) *sizeof(int*)); CHKPTRQ(rbuf2); 291 rbuf2[0] = (int *) PetscMalloc((nrqs*bsz1+1) * sizeof(int)); CHKPTRQ(rbuf2[0]); 292 for (i=1; i<nrqs ; ++i) rbuf2[i] = rbuf2[i-1] + bsz1; 293 294 /* Now post the receives */ 295 recv_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(recv_waits2) 296 CHKPTRQ(recv_waits2); 297 for ( i=0; i<nrqs; ++i){ 298 MPI_Irecv((void *)(rbuf2[i]), bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i); 299 } 300 301 /* Now post the sends */ 302 send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 303 CHKPTRQ(send_waits2); 304 for( i =0; i< nrqr; ++i){ 305 j = recv_status[i].MPI_SOURCE; 306 MPI_Isend( (void *)(xdata[i]), isz1[i], MPI_INT, j, tag, comm, send_waits2+i); 307 } 308 309 /* recieve work done on other processors*/ 310 { 311 int index, is_no, ct1, max; 312 MPI_Status *send_status2 ,*recv_status2; 313 314 recv_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 315 CHKPTRQ(recv_status2); 316 317 318 for ( i=0; i< nrqs; ++i) { 319 MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i); 320 /* Process the message*/ 321 ct1 = 2*rbuf2[index][0]+1; 322 for (j=1; j<=rbuf2[index][0]; j++) { 323 max = rbuf2[index][2*j]; 324 is_no = rbuf2[index][2*j-1]; 325 for (k=0; k < max ; k++, ct1++) { 326 row = rbuf2[index][ct1]; 327 if(!BT_LOOKUP(table[is_no],row)) { data[is_no][isz[is_no]++] = row;} 328 } 329 } 330 } 331 332 333 send_status2 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 334 CHKPTRQ(send_status2); 335 MPI_Waitall(nrqr,send_waits2,send_status2); 336 337 PetscFree(send_status2); PetscFree(recv_status2); 338 } 339 340 TrSpace( &space, &fr, &maxs ); 341 /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs);*/ 342 343 PetscFree(ctr); 344 PetscFree(pa); 345 PetscFree(rbuf2[0]); 346 PetscFree(rbuf2); 347 PetscFree(send_waits); 348 PetscFree(recv_waits); 349 PetscFree(send_waits2); 350 PetscFree(recv_waits2); 351 PetscFree(table[0]); 352 PetscFree(table); 353 PetscFree(send_status); 354 PetscFree(recv_status); 355 PetscFree(isz1); 356 PetscFree(xdata[0]); 357 PetscFree(xdata); 358 359 for ( i=0; i<imax; ++i) { 360 ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr); 361 } 362 PetscFree(isz); 363 PetscFree(data[0]); 364 PetscFree(data); 365 366 return 0; 367 } 368 369 /* FindOverlapLocal() - Called by MatincreaseOverlap, to do the work on 370 the local processor. 371 372 Inputs: 373 C - MAT_MPIAIJ; 374 imax - total no of index sets processed at a time; 375 table - an array of char - size = m bits. 376 377 Output: 378 isz - array containing the count of the solution elements correspondign 379 to each index set; 380 data - pointer to the solutions 381 */ 382 static int FindOverlapLocal(Mat C, int imax, char **table, int *isz,int **data) 383 { 384 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 385 Mat A = c->A, B = c->B; 386 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 387 int start, end, val, max, rstart,cstart, ashift, bshift,*ai, *aj; 388 int *bi, *bj, *garray, i, j, k, row; 389 390 rstart = c->rstart; 391 cstart = c->cstart; 392 ashift = a->indexshift; 393 ai = a->i; 394 aj = a->j +ashift; 395 bshift = b->indexshift; 396 bi = b->i; 397 bj = b->j +bshift; 398 garray = c->garray; 399 400 401 for( i=0; i<imax; i++) { 402 for ( j=0, max =isz[i] ; j< max; j++) { 403 row = data[i][j] - rstart; 404 start = ai[row]; 405 end = ai[row+1]; 406 for ( k=start; k < end; k++) { /* Amat */ 407 val = aj[k] + ashift + cstart; 408 if(!BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;} 409 } 410 start = bi[row]; 411 end = bi[row+1]; 412 for ( k=start; k < end; k++) { /* Bmat */ 413 val = garray[bj[k]+bshift] ; 414 if(! BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;} 415 } 416 } 417 } 418 419 return 0; 420 } 421 /* FindOverlapRecievedMesg - Process the recieved messages, 422 and return the output 423 424 Input: 425 C - the matrix 426 nrqr - no of messages being processed. 427 rbuf - an array of pointers to the recieved requests 428 429 Output: 430 xdata - array of messages to be sent back 431 isz1 - size of each message 432 */ 433 static int FindOverlapRecievedMesg(Mat C, int nrqr, int ** rbuf, int ** xdata, int * isz1 ) 434 { 435 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 436 Mat A = c->A, B = c->B; 437 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 438 int rstart,cstart, ashift, bshift,*ai, *aj, *bi, *bj, *garray, i, j, k; 439 int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 440 int val, max1, max2, rank, m, no_malloc =0, *tmp, new_estimate, ctr; 441 char *xtable; 442 443 rank = c->rank; 444 m = c->M; 445 rstart = c->rstart; 446 cstart = c->cstart; 447 ashift = a->indexshift; 448 ai = a->i; 449 aj = a->j +ashift; 450 bshift = b->indexshift; 451 bi = b->i; 452 bj = b->j +bshift; 453 garray = c->garray; 454 455 456 for (i =0, ct =0, total_sz =0; i< nrqr; ++i){ 457 ct+= rbuf[i][0]; 458 for ( j = 1; j <= rbuf[i][0] ; j++ ) { total_sz += rbuf[i][2*j]; } 459 } 460 461 max1 = ct*(a->nz +b->nz)/c->m; 462 mem_estimate = 3*((total_sz > max1?total_sz:max1)+1); 463 xdata[0] = (int *)PetscMalloc(mem_estimate *sizeof(int)); CHKPTRQ(xdata[0]); 464 ++no_malloc; 465 xtable = (char *)PetscMalloc((m/BITSPERBYTE+1)); CHKPTRQ(xtable); 466 PetscMemzero((void *)isz1,(nrqr+1) *sizeof(int)); 467 468 ct3 = 0; 469 for (i =0; i< nrqr; i++) { /* for easch mesg from proc i */ 470 ct1 = 2*rbuf[i][0]+1; 471 ct2 = ct1; 472 ct3+= ct1; 473 for (j = 1, max1= rbuf[i][0]; j<=max1; j++) { /* for each IS from proc i*/ 474 PetscMemzero((void *)xtable,(m/BITSPERBYTE+1)); 475 oct2 = ct2; 476 for (k =0; k < rbuf[i][2*j]; k++, ct1++) { 477 row = rbuf[i][ct1]; 478 if(!BT_LOOKUP(xtable,row)) { 479 if (!(ct3 < mem_estimate)) { 480 new_estimate = (int)(1.5*mem_estimate)+1; 481 tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp); 482 PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int)); 483 PetscFree(xdata[0]); 484 xdata[0] = tmp; 485 mem_estimate = new_estimate; ++no_malloc; 486 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 487 } 488 xdata[i][ct2++] = row;ct3++; 489 } 490 } 491 for ( k=oct2, max2 =ct2 ; k< max2; k++) { 492 row = xdata[i][k] - rstart; 493 start = ai[row]; 494 end = ai[row+1]; 495 for ( l=start; l < end; l++) { 496 val = aj[l] +ashift + cstart; 497 if(!BT_LOOKUP(xtable,val)) { 498 if (!(ct3 < mem_estimate)) { 499 new_estimate = (int)(1.5*mem_estimate)+1; 500 tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp); 501 PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int)); 502 PetscFree(xdata[0]); 503 xdata[0] = tmp; 504 mem_estimate = new_estimate; ++no_malloc; 505 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 506 } 507 xdata[i][ct2++] = val;ct3++; 508 } 509 } 510 start = bi[row]; 511 end = bi[row+1]; 512 for ( l=start; l < end; l++) { 513 val = garray[bj[l]+bshift] ; 514 if(!BT_LOOKUP(xtable,val)) { 515 if (!(ct3 < mem_estimate)) { 516 new_estimate = (int)(1.5*mem_estimate)+1; 517 tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp); 518 PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int)); 519 PetscFree(xdata[0]); 520 xdata[0] = tmp; 521 mem_estimate = new_estimate; ++no_malloc; 522 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 523 } 524 xdata[i][ct2++] = val;ct3++; 525 } 526 } 527 } 528 /* Update the header*/ 529 xdata[i][2*j] = ct2-oct2; /* Undo the vector isz1 and use only a var*/ 530 xdata[i][2*j-1] = rbuf[i][2*j-1]; 531 } 532 xdata[i][0] = rbuf[i][0]; 533 xdata[i+1] = xdata[i] +ct2; 534 isz1[i] = ct2; /* size of each message */ 535 } 536 PetscFree(xtable); 537 PLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc); 538 return 0; 539 } 540 541 542 int MatGetSubMatrices_MPIAIJ (Mat C,int ismax, IS *isrow,IS *iscol,MatGetSubMatrixCall 543 scall, Mat **submat) 544 { 545 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 546 Mat A = c->A, B = c->B; 547 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data, *mat; 548 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable, start, end, size; 549 int **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2, ierr, **rbuf1, row, proc; 550 int nrqs, msz, **ptr, index, *req_size, *ctr, *pa, tag, *tmp,tcol,bsz, nrqr; 551 int **rbuf3,*req_source,**sbuf_aj, rstart,cstart, ashift, bshift,*ai, *aj; 552 int *bi, *bj,*garray,**rbuf2, max1,max2, **rmap, **cmap, **srmap, **scmap; 553 int **lens, is_no, ncols, *cols, mat_i, *mat_j, tmp2; 554 MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2, *recv_waits3 ; 555 MPI_Request *recv_waits4,*send_waits3,*send_waits4; 556 MPI_Status *recv_status ,*recv_status2,*send_status,*send_status3 ,*send_status2; 557 MPI_Status *recv_status3,*recv_status4,*send_status4; 558 MPI_Comm comm; 559 Scalar **rbuf4, *aa, *ba, **sbuf_aa, *vals, *mat_a; 560 561 comm = C->comm; 562 tag = C->tag; 563 size = c->size; 564 rank = c->rank; 565 m = c->M; 566 rstart = c->rstart; 567 cstart = c->cstart; 568 ashift = a->indexshift; 569 ai = a->i; 570 aj = a->j; 571 aa = a->a; 572 bshift = b->indexshift; 573 bi = b->i; 574 bj = b->j; 575 ba = b->a; 576 garray = c->garray; 577 578 irow = (int **)PetscMalloc((ismax+1)*sizeof(int *)); CHKPTRQ(irow); 579 icol = (int **)PetscMalloc((ismax+1)*sizeof(int *)); CHKPTRQ(icol); 580 nrow = (int *) PetscMalloc((ismax+1)*sizeof(int )); CHKPTRQ(nrow); 581 ncol = (int *) PetscMalloc((ismax+1)*sizeof(int )); CHKPTRQ(ncol); 582 rtable = (int *) PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable); 583 /* Hash table for maping row ->proc */ 584 585 for ( i=0 ; i<ismax ; i++) { /* Extract the indicies and sort them */ 586 ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 587 ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 588 ierr = ISGetLocalSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 589 ierr = ISGetLocalSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 590 ierr = SYIsort(nrow[i], irow[i]); CHKERRQ(ierr); 591 ierr = SYIsort(ncol[i], icol[i]); CHKERRQ(ierr); 592 } 593 594 /* Create hash table for the mapping :row -> proc*/ 595 for( i=0, j=0; i< size; i++) { 596 for (; j <c->rowners[i+1]; j++) { 597 rtable[j] = i; 598 } 599 } 600 601 /* evaluate communication - mesg to who, length of mesg, and buffer space 602 required. Based on this, buffers are allocated, and data copied into them*/ 603 w1 = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /* mesg size */ 604 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 605 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 606 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 607 PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector*/ 608 for ( i=0; i<ismax ; i++) { 609 PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector*/ 610 for ( j =0 ; j < nrow[i] ; j++) { 611 row = irow[i][j]; 612 proc = rtable[row]; 613 w4[proc]++; 614 } 615 for( j = 0; j < size; j++){ 616 if( w4[j] ) { w1[j] += w4[j]; w3[j] += 1;} 617 } 618 } 619 620 nrqs = 0; /* no of outgoing messages */ 621 msz = 0; /* total mesg length (for all proc */ 622 w1[rank] = 0; /* no mesg sent to intself */ 623 w3[rank] = 0; 624 for (i =0; i < size ; i++) { 625 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 626 } 627 pa = (int *)PetscMalloc((nrqs +1)*sizeof(int)); CHKPTRQ(pa); /* (proc -array) */ 628 for (i =0, j=0; i < size ; i++) { 629 if (w1[i]) { pa[j] = i; j++; } 630 } 631 632 /* Each message would have a header = 1 + 2*(no of IS) + data */ 633 for (i = 0; i<nrqs ; i++) { 634 j = pa[i]; 635 w1[j] += w2[j] + 2* w3[j]; 636 msz += w1[j]; 637 } 638 639 /* Do a global reduction to determine how many messages to expect*/ 640 { 641 int *rw1, *rw2; 642 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 643 rw2 = rw1+size; 644 MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm); 645 bsz = rw1[rank]; 646 MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm); 647 nrqr = rw2[rank]; 648 PetscFree(rw1); 649 } 650 651 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 652 rbuf1 = (int**) PetscMalloc((nrqr+1) *sizeof(int*)); CHKPTRQ(rbuf1); 653 rbuf1[0] = (int *) PetscMalloc((nrqr *bsz+1) * sizeof(int)); CHKPTRQ(rbuf1[0]); 654 for (i=1; i<nrqr ; ++i) rbuf1[i] = rbuf1[i-1] + bsz; 655 656 /* Now post the receives */ 657 recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 658 CHKPTRQ(recv_waits); 659 for ( i=0; i<nrqr; ++i){ 660 MPI_Irecv((void *)(rbuf1[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i); 661 } 662 663 /* Allocate Memory for outgoing messages */ 664 sbuf1 = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(sbuf1); 665 PetscMemzero(sbuf1, 2*size*sizeof(int*)); 666 /* allocate memory for outgoing data + buf to recive the first reply */ 667 tmp = (int *)PetscMalloc((2*msz+1) *sizeof (int)); CHKPTRQ(tmp); /*mrsg arr */ 668 ptr = sbuf1 +size; /* Pointers to the data in outgoing buffers */ 669 ctr = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr); 670 671 { 672 int *iptr = tmp; 673 int ict = 0; 674 for (i = 0; i < nrqs ; i++) { 675 j = pa[i]; 676 iptr += ict; 677 sbuf1[j] = iptr; 678 ict = w1[j]; 679 } 680 } 681 682 /* Form the outgoing messages */ 683 /* Initialise the header space */ 684 for ( i=0 ; i<nrqs ; i++) { 685 j = pa[i]; 686 sbuf1[j][0] = 0; 687 PetscMemzero(sbuf1[j]+1, 2 * w3[j]*sizeof(int)); 688 ptr[j] = sbuf1[j] + 2*w3[j] +1; 689 } 690 691 692 /* Parse the isrow and copy data into outbuf */ 693 for ( i=0 ; i<ismax ; i++) { 694 PetscMemzero(ctr,size*sizeof(int)); 695 for( j=0; j<nrow[i]; j++) { /* parse the indices of each IS */ 696 row = irow[i][j]; 697 proc = rtable[row]; 698 if (proc != rank) { /* copy to the outgoing buf*/ 699 ctr[proc]++; 700 *ptr[proc] = row; 701 ptr[proc]++; 702 } 703 } 704 /* Update the headers for the current IS */ 705 for( j = 0; j<size; j++) { /* Can Optimise this loop too */ 706 if (ctr[j]) { 707 k= ++sbuf1[j][0]; 708 sbuf1[j][2*k] = ctr[j]; 709 sbuf1[j][2*k-1] = i; 710 } 711 } 712 } 713 714 /* Now post the sends */ 715 send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 716 CHKPTRQ(send_waits); 717 for( i =0; i< nrqs; ++i){ 718 j = pa[i]; 719 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 720 MPI_Isend( (void *)(sbuf1[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i); 721 } 722 723 /* Post Recieves to capture the buffer size */ 724 recv_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 725 CHKPTRQ(recv_waits2); 726 rbuf2 = (int**)PetscMalloc((nrqs+1) *sizeof(int *)); CHKPTRQ(rbuf2); 727 rbuf2[0] = tmp + msz; 728 for( i =1; i< nrqs; ++i){ 729 j = pa[i]; 730 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 731 } 732 for( i =0; i< nrqs; ++i){ 733 j = pa[i]; 734 MPI_Irecv( (void *)(rbuf2[i]), w1[j], MPI_INT, j, tag+1, comm, recv_waits2+i); 735 } 736 737 /* Send to other procs the buf size they should allocate */ 738 739 740 /* Receive messages*/ 741 send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 742 CHKPTRQ(send_waits2); 743 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 744 CHKPTRQ(recv_status); 745 req_size = (int *) PetscMalloc( (nrqr +1) * sizeof(int)) ; CHKPTRQ(req_size); 746 req_source = (int *) PetscMalloc( (nrqr +1) * sizeof(int)) ; CHKPTRQ(req_source); 747 sbuf2 = (int**) PetscMalloc( (nrqr +1) * sizeof(int*)) ; CHKPTRQ(sbuf2); 748 749 for ( i=0; i< nrqr; ++i) { 750 MPI_Waitany(nrqr, recv_waits, &index, recv_status+i); 751 /* req_size[index] = 2*rbuf1[index][0];*/ 752 req_size[index] = 0; 753 start = 2*rbuf1[index][0] + 1 ; 754 MPI_Get_count(recv_status+i,MPI_INT, &end); 755 sbuf2 [index] = (int *)PetscMalloc(end*sizeof(int)); CHKPTRQ(sbuf2[index]); 756 757 for (j=start; j< end; j++) { 758 row = rbuf1[index][j] - rstart; 759 sbuf2[index][j] = (ai[row+1] - ai[row]); /*overite it with nz count of that row */ 760 sbuf2[index][j] += (bi[row+1] - bi[row]); 761 req_size[index] += sbuf2[index][j]; 762 } 763 req_source[index] = recv_status[i].MPI_SOURCE; 764 /* form the header */ 765 sbuf2[index][0] = req_size[index]; 766 for (j=1; j<start; j++){ sbuf2[index][j] = rbuf1[index][j]; } 767 MPI_Isend((void *)(sbuf2[index]),end,MPI_INT,req_source[index],tag+1, comm, send_waits2+i); 768 } 769 770 /* recv buffer sizes */ 771 /* Receive messages*/ 772 773 rbuf3 = (int**)PetscMalloc((nrqs+1) *sizeof(int*)); CHKPTRQ(rbuf3); 774 rbuf4 = (Scalar**)PetscMalloc((nrqs+1) *sizeof(Scalar*)); CHKPTRQ(rbuf4); 775 recv_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 776 CHKPTRQ(recv_waits3); 777 recv_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 778 CHKPTRQ(recv_waits4); 779 recv_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 780 CHKPTRQ(recv_status2); 781 for ( i=0; i< nrqs; ++i) { 782 MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i); 783 784 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 785 CHKPTRQ(rbuf3[index]); 786 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*sizeof(Scalar)); 787 CHKPTRQ(rbuf4[index]); 788 MPI_Irecv((void *)(rbuf3[index]),rbuf2[index][0], MPI_INT, 789 recv_status2[i].MPI_SOURCE, tag+2, comm, recv_waits3+index); 790 MPI_Irecv((void *)(rbuf4[index]),rbuf2[index][0], MPIU_SCALAR, 791 recv_status2[i].MPI_SOURCE, tag+3, comm, recv_waits4+index); 792 } 793 794 /* Wait on sends1 and sends2 */ 795 send_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 796 CHKPTRQ(send_status); 797 send_status2 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 798 CHKPTRQ(send_status2); 799 800 MPI_Waitall(nrqs,send_waits,send_status); 801 MPI_Waitall(nrqr,send_waits2,send_status2); 802 803 804 /* Now allocate buffers for a->j, and send them off */ 805 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(sbuf_aj); 806 for ( i=0, j =0; i< nrqr; i++) j += req_size[i]; 807 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 808 for (i =1; i< nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 809 810 send_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 811 CHKPTRQ(send_waits3); 812 for (i=0; i<nrqr; i++) { 813 ct1 = 2*rbuf1[i][0]+1; 814 ct2 = 0; 815 for (j=1, max1 = rbuf1[i][0]; j<= max1; j++){ 816 for( k=0; k< rbuf1[i][2*j]; k++, ct1++) { 817 row = rbuf1[i][ct1]; 818 ierr = MatGetRow(C, row, &ncols, &cols, 0); CHKERRQ(ierr); 819 PetscMemcpy(sbuf_aj[i]+ct2, cols, ncols*sizeof(int)); 820 ct2 += ncols; 821 ierr = MatRestoreRow(C,row, &ncols, &cols,0); CHKERRQ(ierr); 822 } 823 } 824 /* no header for this message */ 825 MPI_Isend((void *)(sbuf_aj[i]),req_size[i],MPI_INT,req_source[i],tag+2, comm, send_waits3+i); 826 } 827 recv_status3 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 828 CHKPTRQ(recv_status3); 829 send_status3 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 830 CHKPTRQ(send_status3); 831 832 /* Now allocate buffers for a->a, and send them off */ 833 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa); 834 for ( i=0, j =0; i< nrqr; i++) j += req_size[i]; 835 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]); 836 for (i =1; i< nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 837 838 send_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 839 CHKPTRQ(send_waits4); 840 for (i=0; i<nrqr; i++) { 841 ct1 = 2*rbuf1[i][0]+1; 842 ct2 = 0; 843 for (j=1, max1 = rbuf1[i][0]; j<= max1; j++){ 844 for( k=0; k< rbuf1[i][2*j]; k++, ct1++) { 845 row = rbuf1[i][ct1]; 846 ierr = MatGetRow(C, row, &ncols, 0, &vals); CHKERRQ(ierr); 847 PetscMemcpy(sbuf_aa[i]+ct2, vals, ncols*sizeof(Scalar)); 848 ct2 += ncols; 849 ierr = MatRestoreRow(C,row, &ncols,0,&vals); CHKERRQ(ierr); 850 } 851 } 852 /* no header for this message */ 853 MPI_Isend((void *)(sbuf_aa[i]),req_size[i],MPIU_SCALAR,req_source[i],tag+3, comm, send_waits4+i); 854 } 855 recv_status4 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 856 CHKPTRQ(recv_status4); 857 send_status4 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 858 CHKPTRQ(send_status4); 859 860 861 /* Form the matrix */ 862 /* create col map */ 863 cmap = (int **) PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(cmap); 864 scmap = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(scmap); 865 cmap[0] = (int *)PetscMalloc((1+ ismax*c->N)*sizeof(int)); CHKPTRQ(cmap[0]); 866 PetscMemzero((char *)cmap[0],(1+ ismax*c->N)*sizeof(int)); 867 for (i =1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->N; } 868 for (i =0; i<ismax; i++) { scmap[i] = cmap[i] +ashift;} 869 for (i=0; i< ismax; i++) { 870 for ( j=0; j< ncol[i]; j++) { 871 scmap[i][icol[i][j]] = j+1; 872 } 873 } 874 875 /* Create lens which is required for MatCreate... */ 876 lens = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(lens); 877 for (i =0, j=0; i<ismax; i++) { j +=nrow[i]; } 878 lens[0] = (int *)PetscMalloc((1+ j)*sizeof(int)); CHKPTRQ(lens[0]); 879 PetscMemzero((char *)lens[0], (1+ j)*sizeof(int)); 880 for (i =1; i<ismax; i++) { lens[i] = lens[i-1] +nrow[i-1]; } 881 882 /* Update lens from local data */ 883 for (i=0; i< ismax; i++) { 884 for (j =0; j< nrow[i]; j++) { 885 row = irow[i][j] ; 886 proc = rtable[row]; 887 if (proc == rank) { 888 ierr = MatGetRow(C,row,&ncols,&cols,0); CHKERRQ(ierr); 889 for (k =0; k< ncols; k++) { 890 if ( scmap[i][cols[k]]) { lens[i][j]++ ;} 891 } 892 ierr = MatRestoreRow(C,row,&ncols,&cols,0); CHKERRQ(ierr); 893 } 894 } 895 } 896 897 /* Create row map*/ 898 rmap = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(rmap); 899 srmap = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(srmap); 900 rmap[0] = (int *)PetscMalloc((1+ ismax*c->M)*sizeof(int)); CHKPTRQ(rmap[0]); 901 PetscMemzero((char *)rmap[0],(1+ ismax*c->M)*sizeof(int)); 902 for (i =1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->M ;} 903 for (i =0; i<ismax; i++) { srmap[i] = rmap[i] +ashift;} 904 for (i=0; i< ismax; i++) { 905 for ( j=0; j< nrow[i]; j++) { 906 srmap[i][irow[i][j]] = j; 907 } 908 } 909 910 /* Update lens from offproc data */ 911 for ( tmp2 =0; tmp2 < nrqs; tmp2++) { 912 MPI_Waitany(nrqs, recv_waits3, &i, recv_status3+tmp2); 913 index = pa[i]; 914 ct1 = 2*sbuf1[index][0]+1; /* sbuf1, rbuf2*/ 915 ct2 = 0; /* rbuf3, rbuf4 */ 916 for (j =1; j<= sbuf1[index][0]; j++) { 917 is_no = sbuf1[index][2*j-1]; 918 max1 = sbuf1[index][2*j]; 919 for (k =0; k< max1; k++, ct1++) { 920 row = sbuf1[index][ct1]; 921 row = srmap[is_no][row]; /* the val in the new matrix to be */ 922 max2 = rbuf2[i][ct1]; 923 for (l=0; l<max2; l++, ct2++) { 924 if (scmap[is_no][rbuf3[i][ct2]]) { 925 lens[is_no][row]++; 926 } 927 } 928 } 929 } 930 } 931 MPI_Waitall(nrqr,send_waits3,send_status3); 932 933 /* Create the submatrices */ 934 if( scall == MAT_REUSE_MATRIX) { 935 int n_cols, n_rows; 936 for (i=0; i<ismax; i++){ 937 ierr = MatGetSize((*submat)[i],&n_rows, &n_cols); CHKERRQ(ierr); 938 if ((n_rows !=nrow[i]) || (n_cols !=ncol[i])) { 939 SETERRQ(1,"MatGetSubmatrices_MPIAIJ:"); 940 } 941 } 942 } 943 else { 944 *submat = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(*submat); 945 for ( i=0; i<ismax; i++) { 946 ierr = MatCreateSeqAIJ(comm, nrow[i],ncol[i],0,lens[i],(*submat)+i); CHKERRQ(ierr); 947 } 948 } 949 950 /* Assemble the matrices */ 951 /* First assemble the local rows */ 952 for (i=0; i< ismax; i++) { 953 mat = (Mat_SeqAIJ *)((*submat)[i]->data); 954 for (j =0; j< nrow[i]; j++) { 955 row = irow[i][j] ; 956 proc = rtable[row]; 957 if (proc == rank) { 958 ierr = MatGetRow(C,row,&ncols,&cols,&vals); CHKERRQ(ierr); 959 row = srmap[i][row]; 960 mat_i = mat->i[row]; 961 mat_a = mat->a + mat_i; 962 mat_j = mat->j + mat_i; 963 for (k =0; k< ncols; k++) { 964 if ((tcol = scmap[i][cols[k]])) { 965 *mat_j++ = tcol - 1; 966 *mat_a++ = vals[k]; 967 mat->ilen[row]++; 968 } 969 } 970 ierr = MatRestoreRow(C,row,&ncols,&cols,&vals); CHKERRQ(ierr); 971 } 972 } 973 } 974 975 /* Now assemble the off proc rows*/ 976 for(tmp2 =0; tmp2 <nrqs; tmp2++) { 977 MPI_Waitany(nrqs, recv_waits4, &i, recv_status4+tmp2); 978 index = pa[i]; 979 ct1 = 2*sbuf1[index][0]+1; /* sbuf1, rbuf2*/ 980 ct2 = 0; /* rbuf3, rbuf4 */ 981 for (j =1; j<= sbuf1[index][0]; j++) { 982 is_no = sbuf1[index][2*j-1]; 983 mat = (Mat_SeqAIJ *)((*submat)[is_no]->data); 984 max1 = sbuf1[index][2*j]; 985 for (k =0; k< max1; k++, ct1++) { 986 row = sbuf1[index][ct1]; 987 row = srmap[is_no][row]; /* the val in the new matrix to be */ 988 mat_i = mat->i[row]; 989 mat_a = mat->a + mat_i; 990 mat_j = mat->j + mat_i; 991 max2 = rbuf2[i][ct1]; 992 for (l=0; l<max2; l++, ct2++) { 993 if ((tcol = scmap[is_no][rbuf3[i][ct2]])) { 994 *mat_j++ = tcol - 1; 995 *mat_a++ = rbuf4[i][ct2]; 996 mat->ilen[row]++; 997 } 998 } 999 } 1000 } 1001 } 1002 MPI_Waitall(nrqr,send_waits4,send_status4); 1003 1004 /* Packup*/ 1005 for( i=0; i< ismax; i++) { 1006 ierr = MatAssemblyBegin((*submat)[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1007 } 1008 for( i=0; i< ismax; i++) { 1009 ierr = MatAssemblyEnd((*submat)[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1010 } 1011 /* Restore the indices */ 1012 for (i=0; i<ismax; i++) { 1013 ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1014 ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1015 } 1016 /* Destroy allocated memory */ 1017 PetscFree(nrow); 1018 PetscFree(ncol); 1019 PetscFree(irow); 1020 PetscFree(icol); 1021 PetscFree(rtable); 1022 PetscFree(w1); 1023 PetscFree(pa); 1024 PetscFree(rbuf1[0]); 1025 PetscFree(rbuf1); 1026 PetscFree(sbuf1 ); 1027 PetscFree(tmp); 1028 PetscFree(ctr); 1029 PetscFree(rbuf2); 1030 PetscFree(req_size); 1031 PetscFree(req_source); 1032 for ( i=0; i< nrqr; ++i) { 1033 PetscFree(sbuf2[i]); 1034 } 1035 for ( i=0; i< nrqs; ++i) { 1036 PetscFree(rbuf3[i]); 1037 PetscFree(rbuf4[i]); 1038 } 1039 1040 PetscFree( sbuf2 ); 1041 PetscFree(rbuf3); 1042 PetscFree(rbuf4 ); 1043 PetscFree(sbuf_aj[0]); 1044 PetscFree(sbuf_aj); 1045 PetscFree(sbuf_aa[0]); 1046 PetscFree(sbuf_aa); 1047 1048 PetscFree(cmap[0]); 1049 PetscFree(rmap[0]); 1050 PetscFree(cmap); 1051 PetscFree(rmap); 1052 PetscFree(scmap); 1053 PetscFree(srmap); 1054 PetscFree(lens[0]); 1055 PetscFree(lens); 1056 1057 PetscFree(recv_waits ); 1058 PetscFree(recv_waits2); 1059 PetscFree(recv_waits3); 1060 PetscFree(recv_waits4); 1061 1062 PetscFree(recv_status); 1063 PetscFree(recv_status2); 1064 PetscFree(recv_status3); 1065 PetscFree(recv_status4); 1066 1067 PetscFree(send_waits); 1068 PetscFree(send_waits2); 1069 PetscFree(send_waits3); 1070 PetscFree(send_waits4); 1071 1072 PetscFree( send_status); 1073 PetscFree(send_status2); 1074 PetscFree(send_status3); 1075 PetscFree(send_status4); 1076 1077 return 0; 1078 } 1079 1080 1081 1082 1083