#ifndef lint
static char vcid[] = "$Id: mpiov.c,v 1.18 1996/02/08 23:32:10 balay Exp balay $";
#endif

#include "mpiaij.h"
#include "inline/bitarray.h"

static int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *);
static int FindOverlapLocal(Mat, int, char **, int *, int **);
static int FindOverlapRecievedMesg(Mat, int, int **, int **, int *);

int MatIncreaseOverlap_MPIAIJ(Mat C, int imax, IS *is, int ov)
{
  int i, ierr;
  if (ov < 0) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: negative overlap specified\n");}
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_private(C, imax, is); CHKERRQ(ierr);
  }
  return 0;
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1], is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5 => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]   data(is[1])
  -----------
  mesg [n+1]
  mesg [m]   data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
*/
static int MatIncreaseOverlap_MPIAIJ_private(Mat C, int imax, IS *is)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable, **data;
  int         size, rank, m, i, j, k, ierr, **rbuf, row, proc, nrqs, msz, **outdat, **ptr;
  int         *ctr, *pa, tag, *tmp, bsz, nrqr, *isz, *isz1, **xdata;
  int         bsz1, **rbuf2;
  char        **table;
  MPI_Comm    comm;
  MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2;
  MPI_Status  *send_status, *recv_status;
  double      space, fr, maxs;

  comm = C->comm;
  tag  = C->tag;
  size = c->size;
  rank = c->rank;
  m    = c->M;

  TrSpace( &space, &fr, &maxs );
  /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */

  idx    = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(idx);
  n      = (int *)PetscMalloc((imax+1)*sizeof(int)); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(rtable);
  /* Hash table for mapping row -> proc */

  for ( i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for ( i=0, j=0; i<size; i++) {
    for (; j<c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }
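  /*
     Illustrative example (not from the original source): suppose this proc
     must ask proc 3 to process 2 rows from is[0] and 1 row from is[2].  The
     buffer assembled for proc 3 by the code below would then contain

        outdat[3] = { 2,          header: 2 index sets in this message
                      0, 2,       is[0], followed by its count of 2 rows
                      2, 1,       is[2], followed by its count of 1 row
                      r0, r1,     the 2 global row numbers from is[0]
                      r2 }        the 1 global row number  from is[2]

     (r0, r1, r2 are placeholders for actual row numbers).  This matches the
     header size 1 + 2*(no of IS) that is added to w1[] below.
  */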
  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  w1 = (int *)PetscMalloc((size)*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;      /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;      /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;      /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<imax; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector */
    for ( j=0; j<n[i]; j++) {
      row  = idx[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j] += 1;}
    }
  }

  nrqs     = 0;  /* no of outgoing messages */
  msz      = 0;  /* total mesg length (for all procs) */
  w1[rank] = 0;  /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) */
  for (i=0, j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz  = rw1[rank];
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
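  /*
     What the two reductions above compute (illustrative example, 3 procs,
     this rank = 1; not part of the original source):
       w1 on proc 0 = {0,7,4}   w1 on proc 1 = {5,0,3}   w1 on proc 2 = {6,2,0}
     The column-wise MPI_MAX gives rw1 = {6,7,4}, so bsz = rw1[1] = 7 is an
     upper bound on the length of any request this proc will receive.  The
     column-wise MPI_SUM over the 0/1 flags in w2 gives, in rw2[rank], the
     number of procs that have a request for this proc, i.e. nrqr.
  */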
  /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */
  rbuf    = (int**) PetscMalloc((nrqr+1)*sizeof(int*)); CHKPTRQ(rbuf);
  rbuf[0] = (int *) PetscMalloc((nrqr*bsz+1)*sizeof(int)); CHKPTRQ(rbuf[0]);
  for (i=1; i<nrqr; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrqr; ++i) {
    MPI_Irecv((void *)(rbuf[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Allocate memory for outgoing messages */
  outdat = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat);
  PetscMemzero(outdat, 2*size*sizeof(int*));
  tmp    = (int *)PetscMalloc((msz+1)*sizeof(int)); CHKPTRQ(tmp); /* mesg arr */
  ptr    = outdat + size;   /* Pointers to the data in outgoing buffers */
  ctr    = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr);

  {
    int *iptr = tmp;
    int ict   = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for ( i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1, 2*w3[j]*sizeof(int));
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing the local proc's work */
  table    = (char **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(table);
  data     = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(data);
  table[0] = (char *)PetscMalloc((m/BITSPERBYTE+1)*(imax)); CHKPTRQ(table[0]);
  data[0]  = (int *)PetscMalloc((m+1)*(imax)*sizeof(int)); CHKPTRQ(data[0]);

  for (i=1; i<imax; i++) {
    table[i] = table[0] + (m/BITSPERBYTE+1)*i;
    data[i]  = data[0] + (m+1)*i;
  }

  PetscMemzero((void*)*table,(m/BITSPERBYTE+1)*(imax));
  isz = (int *)PetscMalloc((imax+1)*sizeof(int)); CHKPTRQ(isz);
  PetscMemzero((void *)isz,(imax+1)*sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data */
  for ( i=0; i<imax; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for ( j=0; j<n[i]; j++) {  /* parse the indices of each IS */
      row  = idx[i][j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
      else { /* Update the table */
        if (!BT_LOOKUP(table[i],row)) { data[i][isz[i]++] = row;}
      }
    }
    /* Update the headers for the current IS */
    for ( j=0; j<size; j++) { /* Can optimise this loop too */
      if (ctr[j]) {
        k                = ++outdat[j][0];
        outdat[j][2*k]   = ctr[j];
        outdat[j][2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for (i=0; i<nrqs; ++i) {
    j = pa[i];
    MPI_Isend((void *)(outdat[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  /* I no longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  PetscFree(idx);
  PetscFree(n);
  PetscFree(rtable);
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }

  /* Do local work */
  ierr = FindOverlapLocal(C, imax, table, isz, data); CHKERRQ(ierr);
  /* Extract the matrices */

  /* Receive messages */
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
    CHKPTRQ(recv_status);
    for ( i=0; i<nrqr; ++i) {
      MPI_Waitany(nrqr, recv_waits, &index, recv_status+i);
    }

    send_status = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
    CHKPTRQ(send_status);
    MPI_Waitall(nrqs,send_waits,send_status);
  }
  /* Phase 1 sends are complete - deallocate buffers */
  PetscFree(outdat);
  PetscFree(w1);
  PetscFree(tmp);

  xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata);
  isz1  = (int *)PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(isz1);
  ierr  = FindOverlapRecievedMesg(C, nrqr, rbuf, xdata, isz1); CHKERRQ(ierr);

  /* No longer need rbuf. */
  PetscFree(rbuf[0]);
  PetscFree(rbuf);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero((void*)rw1,2*size*sizeof(int));
    rw2 = rw1 + size;
    for (i=0; i<nrqr; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers */

  /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */
  rbuf2    = (int**) PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) PetscMalloc((nrqs*bsz1+1)*sizeof(int)); CHKPTRQ(rbuf2[0]);
  for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Now post the receives */
  recv_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits2);
  for ( i=0; i<nrqs; ++i) {
    MPI_Irecv((void *)(rbuf2[i]), bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i);
  }

  /* Now post the sends */
  send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  for (i=0; i<nrqr; ++i) {
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend((void *)(xdata[i]), isz1[i], MPI_INT, j, tag, comm, send_waits2+i);
  }

  /* receive work done on other processors */
  {
    int        index, is_no, ct1, max;
    MPI_Status *send_status2, *recv_status2;

    recv_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
    CHKPTRQ(recv_status2);

    for ( i=0; i<nrqs; ++i) {
      MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i);
      /* Process the message */
      ct1 = 2*rbuf2[index][0] + 1;
      for (j=1; j<=rbuf2[index][0]; j++) {
        max   = rbuf2[index][2*j];
        is_no = rbuf2[index][2*j-1];
        for (k=0; k<max; k++, ct1++) {
          row = rbuf2[index][ct1];
          if (!BT_LOOKUP(table[is_no],row)) { data[is_no][isz[is_no]++] = row;}
        }
      }
    }

    send_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
    CHKPTRQ(send_status2);
    MPI_Waitall(nrqr,send_waits2,send_status2);

    PetscFree(send_status2); PetscFree(recv_status2);
  }

  TrSpace( &space, &fr, &maxs );
  /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */

  PetscFree(ctr);
  PetscFree(pa);
  PetscFree(rbuf2[0]);
  PetscFree(rbuf2);
  PetscFree(send_waits);
  PetscFree(recv_waits);
  PetscFree(send_waits2);
  PetscFree(recv_waits2);
  PetscFree(table[0]);
  PetscFree(table);
  PetscFree(send_status);
  PetscFree(recv_status);
  PetscFree(isz1);
  PetscFree(xdata[0]);
  PetscFree(xdata);
  for ( i=0; i<imax; ++i) {
    ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }
  PetscFree(isz);
  PetscFree(data[0]);
  PetscFree(data);

  return 0;
}

/* FindOverlapLocal() - Called by MatIncreaseOverlap, to do the work on
   the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static int FindOverlapLocal(Mat C, int imax, char **table, int *isz, int **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
  Mat        A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data;
  int        start, end, val, max, rstart, cstart, ashift, bshift, *ai, *aj;
  int        *bi, *bj, *garray, i, j, k, row;

  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j + ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j + bshift;
  garray = c->garray;

  for ( i=0; i<imax; i++) {
    for ( j=0, max=isz[i]; j<max; j++) {
      row   = data[i][j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for ( k=start; k<end; k++) { /* Amat */
        val = aj[k] + ashift + cstart;
        if (!BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for ( k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]+bshift];
        if (!BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;}
      }
    }
  }

  return 0;
}
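/*
   Illustrative note (not part of the original source): for each locally owned
   row r above, the global column numbers of all its nonzeros are added to the
   index set.  Columns of the diagonal block A are local, so cstart is added
   to make them global; columns of the off-diagonal block B are looked up in
   garray[], which maps B's compressed column numbering back to global column
   numbers.  E.g. with cstart = 10, local A-columns {0,3} and
   garray[B-columns] = {2,57}, row r contributes the global columns
   {10, 13, 2, 57} (duplicates are filtered by the bit table).
*/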
/* FindOverlapRecievedMesg - Process the received messages,
   and return the output

     Input:
      C     - the matrix
      nrqr  - no of messages being processed.
      rbuf  - an array of pointers to the received requests

     Output:
      xdata - array of messages to be sent back
      isz1  - size of each message
*/
static int FindOverlapRecievedMesg(Mat C, int nrqr, int **rbuf, int **xdata, int *isz1)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
  Mat        A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data;
  int        rstart, cstart, ashift, bshift, *ai, *aj, *bi, *bj, *garray, i, j, k;
  int        row, total_sz, ct, ct1, ct2, ct3, mem_estimate, oct2, l, start, end;
  int        val, max1, max2, rank, m, no_malloc = 0, *tmp, new_estimate, ctr;
  char       *xtable;

  rank   = c->rank;
  m      = c->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j + ashift;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j + bshift;
  garray = c->garray;

  for (i=0, ct=0, total_sz=0; i<nrqr; ++i) {
    ct += rbuf[i][0];
    for ( j=1; j<=rbuf[i][0]; j++) { total_sz += rbuf[i][2*j]; }
  }

  max1         = ct*(a->nz + b->nz)/c->m;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  xdata[0]     = (int *)PetscMalloc(mem_estimate*sizeof(int)); CHKPTRQ(xdata[0]);
  ++no_malloc;
  xtable       = (char *)PetscMalloc((m/BITSPERBYTE+1)); CHKPTRQ(xtable);
  PetscMemzero((void *)isz1,(nrqr+1)*sizeof(int));

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    ct1  = 2*rbuf[i][0] + 1;
    ct2  = ct1;
    ct3 += ct1;
    for (j=1, max1=rbuf[i][0]; j<=max1; j++) { /* for each IS from proc i */
      PetscMemzero((void *)xtable,(m/BITSPERBYTE+1));
      oct2 = ct2;
      for (k=0; k<rbuf[i][2*j]; k++, ct1++) {
        row = rbuf[i][ct1];
        if (!BT_LOOKUP(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (int)(1.5*mem_estimate) + 1;
            tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
            PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
            PetscFree(xdata[0]);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row; ct3++;
        }
      }
      for ( k=oct2, max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for ( l=start; l<end; l++) { /* Amat */
          val = aj[l] + ashift + cstart;
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate) + 1;
              tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val; ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for ( l=start; l<end; l++) { /* Bmat */
          val = garray[bj[l]+bshift];
          if (!BT_LOOKUP(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (int)(1.5*mem_estimate) + 1;
              tmp          = (int*) PetscMalloc(new_estimate*sizeof(int)); CHKPTRQ(tmp);
              PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
              PetscFree(xdata[0]);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val; ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf[i][2*j-1];
    }
    xdata[i][0] = rbuf[i][0];
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  PetscFree(xtable);
  PLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
  return 0;
}
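/*
   Illustrative note (not part of the original source): each reply assembled
   by FindOverlapRecievedMesg() reuses the request's header format.  For a
   request rbuf[i] that named two index sets, the reply xdata[i] looks like

      { 2,               no of index sets (copied from the request)
        is_a, na,        index-set number, then the augmented row count
        is_b, nb,
        ...na rows...,   requested rows plus their neighbouring columns
        ...nb rows... }

   with isz1[i] holding the total length.  The reply is sent back from
   MatIncreaseOverlap_MPIAIJ_private() and merged there into data[]/isz[].
*/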
int MatGetSubMatrices_MPIAIJ(Mat C, int imax, IS *irow, IS *icol, MatGetSubMatrixCall scall, Mat **submat)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  Mat         A = c->A, B = c->B;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable, start, end, size, **sbuf1, **sbuf2, rank;
  int         m, i, j, k, l, ct1, ct2, ierr, **rbuf1, row, proc, nrqs, msz, **ptr, index;
  int         *req_size, *ctr, *pa, tag, *tmp, bsz, nrqr, **rbuf3, *req_source, **sbuf_aj;
  int         rstart, cstart, ashift, bshift, *ai, *aj, *bi, *bj, *garray, **rbuf2, max1;
  MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2, *recv_waits3;
  MPI_Request *recv_waits4, *send_waits3, *send_waits4;
  MPI_Status  *recv_status, *recv_status2, *send_status, *send_status3, *send_status2;
  MPI_Status  *recv_status3, *recv_status4, *send_status4;
  MPI_Comm    comm;
  Scalar      **rbuf4, *aa, *ba, **sbuf_aa;
  double      space, fr, maxs;

  comm   = C->comm;
  tag    = C->tag;
  size   = c->size;
  rank   = c->rank;
  m      = c->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ashift = a->indexshift;
  ai     = a->i;
  aj     = a->j;
  aa     = a->a;
  bshift = b->indexshift;
  bi     = b->i;
  bj     = b->j;
  ba     = b->a;
  garray = c->garray;

  TrSpace( &space, &fr, &maxs );
  /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */

  idx    = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(idx);
  n      = (int *)PetscMalloc((imax+1)*sizeof(int)); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int)); CHKPTRQ(rtable);
  /* Hash table for mapping row -> proc */

  for ( i=0; i<imax; i++) {
    ierr = ISGetIndices(irow[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetLocalSize(irow[i],&n[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping: row -> proc */
  for ( i=0, j=0; i<size; i++) {
    for (; j<c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  w1 = (int *)PetscMalloc((size)*4*sizeof(int)); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;      /* if w2[i] marked, then a message to proc i */
  w3 = w2 + size;      /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;      /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector */
  for ( i=0; i<imax; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector */
    for ( j=0; j<n[i]; j++) {
      row  = idx[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j] += 1;}
    }
  }

  nrqs     = 0;  /* no of outgoing messages */
  msz      = 0;  /* total mesg length (for all procs) */
  w1[rank] = 0;  /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int)); CHKPTRQ(pa); /* (proc-array) */
  for (i=0, j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Do a global reduction to determine how many messages to expect */
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1 + size;
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz  = rw1[rank];
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }
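  /*
     Overview of the exchange that follows (summary added for readability,
     not part of the original source):
       tag   : send the requested (global) row numbers to their owning procs;
       tag+1 : owners reply with the nonzero count of every requested row,
               so the requesters can size the index/value receive buffers;
       tag+2 : owners send the global column indices of those rows (rbuf3);
       tag+3 : owners send the corresponding numerical values (rbuf4).
     The assembly of the sequential submatrices from rbuf3/rbuf4 is still to
     be done at the end of this routine.
  */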
  /* Allocate memory for recv buffers. Prob none if nrqr = 0 ???? */
  rbuf1    = (int**) PetscMalloc((nrqr+1)*sizeof(int*)); CHKPTRQ(rbuf1);
  rbuf1[0] = (int *) PetscMalloc((nrqr*bsz+1)*sizeof(int)); CHKPTRQ(rbuf1[0]);
  for (i=1; i<nrqr; ++i) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nrqr; ++i) {
    MPI_Irecv((void *)(rbuf1[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Allocate memory for outgoing messages */
  sbuf1 = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(sbuf1);
  PetscMemzero(sbuf1, 2*size*sizeof(int*));
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp   = (int *)PetscMalloc((2*msz+1)*sizeof(int)); CHKPTRQ(tmp); /* mesg arr */
  ptr   = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
  ctr   = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr);

  {
    int *iptr = tmp;
    int ict   = 0;
    for (i=0; i<nrqs; i++) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialise the header space */
  for ( i=0; i<nrqs; i++) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int));
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the irow and copy data into outbuf */
  for ( i=0; i<imax; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for ( j=0; j<n[i]; j++) {  /* parse the indices of each IS */
      row  = idx[i][j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for ( j=0; j<size; j++) { /* Can optimise this loop too */
      if (ctr[j]) {
        k               = ++sbuf1[j][0];
        sbuf1[j][2*k]   = ctr[j];
        sbuf1[j][2*k-1] = i;
      }
    }
  }

  /* Now post the sends */
  send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for (i=0; i<nrqs; ++i) {
    j = pa[i];
    printf("[%d] Send Req to %d: size %d \n", rank, j, w1[j]);
    MPI_Isend((void *)(sbuf1[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  /* Post receives to capture the buffer size */
  recv_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits2);
  rbuf2    = (int**)PetscMalloc((nrqs+1)*sizeof(int *)); CHKPTRQ(rbuf2);
  rbuf2[0] = tmp + msz;
  for (i=1; i<nrqs; ++i) {
    rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]]; /* offset by the size of the previous request's buffer */
  }
  for (i=0; i<nrqs; ++i) {
    j = pa[i];
    MPI_Irecv((void *)(rbuf2[i]), w1[j], MPI_INT, j, tag+1, comm, recv_waits2+i);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages */
  send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  recv_status = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(recv_status);
  req_size   = (int *) PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(req_size);
  req_source = (int *) PetscMalloc((nrqr+1)*sizeof(int)); CHKPTRQ(req_source);
  sbuf2      = (int**) PetscMalloc((nrqr+1)*sizeof(int*)); CHKPTRQ(sbuf2);

  for ( i=0; i<nrqr; ++i) {
    MPI_Waitany(nrqr, recv_waits, &index, recv_status+i);
    /* req_size[index] = 2*rbuf1[index][0]; */
    req_size[index] = 0;
    start = 2*rbuf1[index][0] + 1;
    MPI_Get_count(recv_status+i, MPI_INT, &end);
    sbuf2[index] = (int *)PetscMalloc(end*sizeof(int)); CHKPTRQ(sbuf2[index]);

    for (j=start; j<end; j++) {
      row              = rbuf1[index][j] - rstart;
      sbuf2[index][j]  = (ai[row+1] - ai[row]);  /* overwrite it with the nz count of that row */
      sbuf2[index][j] += (bi[row+1] - bi[row]);
      req_size[index] += sbuf2[index][j];
    }
    req_source[index] = recv_status[i].MPI_SOURCE;
    /* form the header */
    sbuf2[index][0] = req_size[index];
    for (j=1; j<start; j++) { sbuf2[index][j] = rbuf1[index][j]; }
    printf("[%d] Send to %d: size %d, index = %d start = %d end = %d \n", rank,
           req_source[index], *(req_size+index), index, start, end);
    MPI_Isend((void *)(sbuf2[index]), end, MPI_INT, req_source[index], tag+1, comm, send_waits2+i);
  }

  /* recv buffer sizes */
  /* Receive messages */

  rbuf3       = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3);
  rbuf4       = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*)); CHKPTRQ(rbuf4);
  recv_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits3);
  recv_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits4);
  recv_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(recv_status2);
  for ( i=0; i<nrqs; ++i) {
    MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i);

    rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int));
    CHKPTRQ(rbuf3[index]);
    rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*sizeof(Scalar));
    CHKPTRQ(rbuf4[index]);
    printf("[%d] Posting Irecv for aj, aa from %d, size of buf = %d \n", rank,
           recv_status2[i].MPI_SOURCE, rbuf2[index][0]);
    MPI_Irecv((void *)(rbuf3[index]), rbuf2[index][0], MPI_INT,
              recv_status2[i].MPI_SOURCE, tag+2, comm, recv_waits3+index);
    MPI_Irecv((void *)(rbuf4[index]), rbuf2[index][0], MPIU_SCALAR,
              recv_status2[i].MPI_SOURCE, tag+3, comm, recv_waits4+index);
  }

  /* Wait on sends1 and sends2 */
  send_status = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(send_status);
  send_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(send_status2);

  MPI_Waitall(nrqs,send_waits,send_status);
  MPI_Waitall(nrqr,send_waits2,send_status2);

  /* Now allocate buffers for a->j, and send them off */
  sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(sbuf_aj);
  for ( i=0, j=0; i<nrqr; i++) j += req_size[i];
  sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]);
  for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];

  send_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits3);
  for (i=0; i<nrqr; i++) {
    ct1 = 2*rbuf1[i][0] + 1;
    ct2 = 0;
    for (j=1, max1=rbuf1[i][0]; j<=max1; j++) {
      for ( k=0; k<rbuf1[i][2*j]; k++, ct1++) {
        row   = rbuf1[i][ct1] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          sbuf_aj[i][ct2++] = aj[l] + cstart;
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          sbuf_aj[i][ct2++] = garray[bj[l]];
        }
      }
    }
    /* no header for this message */
    printf("[%d] Send AJ to %d: size %d, ct2 = %d \n", rank, req_source[i], req_size[i], ct2);
    MPI_Isend((void *)(sbuf_aj[i]), req_size[i], MPI_INT, req_source[i], tag+2, comm, send_waits3+i);
  }
  recv_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(recv_status3);
  send_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(send_status3);

  for (i=0; i<nrqs; ++i) {
    MPI_Waitany(nrqs, recv_waits3, &index, recv_status3+i);
  }

  for (i=0; i<nrqr; ++i) {
    MPI_Waitany(nrqr, send_waits3, &index, send_status3+i);
  }

  /* MPI_Waitall(nrqs,recv_waits3,recv_status3);
     MPI_Waitall(nrqr,send_waits3,send_status3); */

  /* Now allocate buffers for a->a, and send them off */
  sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa);
  for ( i=0, j=0; i<nrqr; i++) j += req_size[i];
  sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]);
  for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  send_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits4);
  for (i=0; i<nrqr; i++) {
    ct1 = 2*rbuf1[i][0] + 1;
    ct2 = 0;
    for (j=1, max1=rbuf1[i][0]; j<=max1; j++) {
      for ( k=0; k<rbuf1[i][2*j]; k++, ct1++) {
        row   = rbuf1[i][ct1] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          sbuf_aa[i][ct2++] = aa[l];
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          sbuf_aa[i][ct2++] = ba[l];
        }
      }
    }
    /* no header for this message */
    printf("[%d] Send AA to %d: size %d, ct2 = %d \n", rank, req_source[i], req_size[i], ct2);
    MPI_Isend((void *)(sbuf_aa[i]), req_size[i], MPIU_SCALAR, req_source[i], tag+3, comm, send_waits4+i);
  }
  recv_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));
  CHKPTRQ(recv_status4);
  send_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));
  CHKPTRQ(send_status4);

  for (i=0; i<nrqs; ++i) {
    MPI_Waitany(nrqs, recv_waits4, &index, recv_status4+i);
  }

  for (i=0; i<nrqr; ++i) {
    MPI_Waitany(nrqr, send_waits4, &index, send_status4+i);
  }

  /* MPI_Waitall(nrqs,recv_waits4,recv_status4);
     MPI_Waitall(nrqr,send_waits4,send_status4); */

  /* Now you have all the data required, so form the matrix */
  /* rbuf3 -> aj, rbuf4 -> aa */

  PetscFree(idx);
  PetscFree(n);
  PetscFree(rtable);
  PetscFree(w1);
  PetscFree(pa);
  PetscFree(rbuf1[0]);
  PetscFree(rbuf1);
  PetscFree(recv_waits);
  PetscFree(sbuf1);
  PetscFree(tmp);
  PetscFree(ctr);
  PetscFree(send_waits);
  PetscFree(recv_waits2);
  PetscFree(rbuf2);
  PetscFree(send_waits2);
  PetscFree(recv_status);
  PetscFree(req_size);
  PetscFree(req_source);
  for ( i=0; i<nrqr; ++i) {
    PetscFree(sbuf2[i]);
  }
  for ( i=0; i<nrqs; ++i) {
    PetscFree(rbuf3[i]);
    PetscFree(rbuf4[i]);
  }

  PetscFree(sbuf2);
  PetscFree(rbuf3);
  PetscFree(rbuf4);
  PetscFree(recv_waits3);
  PetscFree(recv_waits4);
  PetscFree(recv_status2);
  PetscFree(send_status);
  PetscFree(send_status2);
  PetscFree(sbuf_aj[0]);
  PetscFree(sbuf_aj);
  PetscFree(send_waits3);
  PetscFree(recv_status3);
  PetscFree(send_status3);
  PetscFree(sbuf_aa[0]);
  PetscFree(sbuf_aa);
  PetscFree(send_waits4);
  PetscFree(recv_status4);

  return 0;
}