1 #ifndef lint 2 static char vcid[] = "$Id: mpiov.c,v 1.21 1996/02/13 22:36:43 balay Exp balay $"; 3 #endif 4 5 #include "mpiaij.h" 6 #include "inline/bitarray.h" 7 8 static int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *); 9 static int FindOverlapLocal(Mat , int , char **,int*, int**); 10 static int FindOverlapRecievedMesg(Mat , int, int **, int**, int* ); 11 12 int MatIncreaseOverlap_MPIAIJ(Mat C, int imax, IS *is, int ov) 13 { 14 int i, ierr; 15 if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIAIJ: negative overlap specified\n");} 16 for (i =0; i<ov; ++i) { 17 ierr = MatIncreaseOverlap_MPIAIJ_private(C, imax, is); CHKERRQ(ierr); 18 } 19 return 0; 20 } 21 22 /* 23 Sample message format: 24 If a processor A wants processor B to process some elements corresponding 25 to index sets 1s[1], is[5] 26 mesg [0] = 2 ( no of index sets in the mesg) 27 ----------- 28 mesg [1] = 1 => is[1] 29 mesg [2] = sizeof(is[1]); 30 ----------- 31 mesg [5] = 5 => is[5] 32 mesg [6] = sizeof(is[5]); 33 ----------- 34 mesg [7] 35 mesg [n] datas[1] 36 ----------- 37 mesg[n+1] 38 mesg[m] data(is[5]) 39 ----------- 40 41 Notes: 42 nrqs - no of requests sent (or to be sent out) 43 nrqr - no of requests recieved (which have to be or which have been processed 44 */ 45 static int MatIncreaseOverlap_MPIAIJ_private(Mat C, int imax, IS *is) 46 { 47 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 48 int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data; 49 int size, rank, m,i,j,k, ierr, **rbuf, row, proc, nrqs, msz, **outdat, **ptr; 50 int *ctr, *pa, tag, *tmp,bsz, nrqr , *isz, *isz1, **xdata; 51 int bsz1, **rbuf2; 52 char **table; 53 MPI_Comm comm; 54 MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2 ; 55 MPI_Status *send_status ,*recv_status; 56 double space, fr, maxs; 57 58 comm = C->comm; 59 tag = C->tag; 60 size = c->size; 61 rank = c->rank; 62 m = c->M; 63 64 65 TrSpace( &space, &fr, &maxs ); 66 /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */ 67 68 idx = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(idx); 69 n = (int *)PetscMalloc((imax+1)*sizeof(int )); CHKPTRQ(n); 70 rtable = (int *)PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable); 71 /* Hash table for maping row ->proc */ 72 73 for ( i=0 ; i<imax ; i++) { 74 ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr); 75 ierr = ISGetLocalSize(is[i],&n[i]); CHKERRQ(ierr); 76 } 77 78 /* Create hash table for the mapping :row -> proc*/ 79 for( i=0, j=0; i< size; i++) { 80 for (; j <c->rowners[i+1]; j++) { 81 rtable[j] = i; 82 } 83 } 84 85 /* evaluate communication - mesg to who, length of mesg, and buffer space 86 required. Based on this, buffers are allocated, and data copied into them*/ 87 w1 = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /* mesg size */ 88 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 89 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 90 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 91 PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector*/ 92 for ( i=0; i<imax ; i++) { 93 PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector*/ 94 for ( j =0 ; j < n[i] ; j++) { 95 row = idx[i][j]; 96 proc = rtable[row]; 97 w4[proc]++; 98 } 99 for( j = 0; j < size; j++){ 100 if( w4[j] ) { w1[j] += w4[j]; w3[j] += 1;} 101 } 102 } 103 104 nrqs = 0; /* no of outgoing messages */ 105 msz = 0; /* total mesg length (for all proc */ 106 w1[rank] = 0; /* no mesg sent to intself */ 107 w3[rank] = 0; 108 for (i =0; i < size ; i++) { 109 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 110 } 111 pa = (int *)PetscMalloc((nrqs +1)*sizeof(int)); CHKPTRQ(pa); /* (proc -array) */ 112 for (i =0, j=0; i < size ; i++) { 113 if (w1[i]) { pa[j] = i; j++; } 114 } 115 116 /* Each message would have a header = 1 + 2*(no of IS) + data */ 117 for (i = 0; i<nrqs ; i++) { 118 j = pa[i]; 119 w1[j] += w2[j] + 2* w3[j]; 120 msz += w1[j]; 121 } 122 123 124 /* Do a global reduction to determine how many messages to expect*/ 125 { 126 int *rw1, *rw2; 127 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 128 rw2 = rw1+size; 129 MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm); 130 bsz = rw1[rank]; 131 MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm); 132 nrqr = rw2[rank]; 133 PetscFree(rw1); 134 } 135 136 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 137 rbuf = (int**) PetscMalloc((nrqr+1) *sizeof(int*)); CHKPTRQ(rbuf); 138 rbuf[0] = (int *) PetscMalloc((nrqr *bsz+1) * sizeof(int)); CHKPTRQ(rbuf[0]); 139 for (i=1; i<nrqr ; ++i) rbuf[i] = rbuf[i-1] + bsz; 140 141 /* Now post the receives */ 142 recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 143 CHKPTRQ(recv_waits); 144 for ( i=0; i<nrqr; ++i){ 145 MPI_Irecv((void *)(rbuf[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i); 146 } 147 148 /* Allocate Memory for outgoing messages */ 149 outdat = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat); 150 PetscMemzero(outdat, 2*size*sizeof(int*)); 151 tmp = (int *)PetscMalloc((msz+1) *sizeof (int)); CHKPTRQ(tmp); /*mrsg arr */ 152 ptr = outdat +size; /* Pointers to the data in outgoing buffers */ 153 ctr = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr); 154 155 { 156 int *iptr = tmp; 157 int ict = 0; 158 for (i = 0; i < nrqs ; i++) { 159 j = pa[i]; 160 iptr += ict; 161 outdat[j] = iptr; 162 ict = w1[j]; 163 } 164 } 165 166 /* Form the outgoing messages */ 167 /*plug in the headers*/ 168 for ( i=0 ; i<nrqs ; i++) { 169 j = pa[i]; 170 outdat[j][0] = 0; 171 PetscMemzero(outdat[j]+1, 2 * w3[j]*sizeof(int)); 172 ptr[j] = outdat[j] + 2*w3[j] +1; 173 } 174 175 /* Memory for doing local proc's work*/ 176 table = (char **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(table); 177 data = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(data); 178 table[0] = (char *)PetscMalloc((m/BITSPERBYTE +1)*(imax)); CHKPTRQ(table[0]); 179 data [0] = (int *)PetscMalloc((m+1)*(imax)*sizeof(int)); CHKPTRQ(data[0]); 180 181 for(i = 1; i<imax ; i++) { 182 table[i] = table[0] + (m/BITSPERBYTE+1)*i; 183 data[i] = data[0] + (m+1)*i; 184 } 185 186 PetscMemzero((void*)*table,(m/BITSPERBYTE+1)*(imax)); 187 isz = (int *)PetscMalloc((imax+1) *sizeof(int)); CHKPTRQ(isz); 188 PetscMemzero((void *)isz,(imax+1) *sizeof(int)); 189 190 /* Parse the IS and update local tables and the outgoing buf with the data*/ 191 for ( i=0 ; i<imax ; i++) { 192 PetscMemzero(ctr,size*sizeof(int)); 193 for( j=0; j<n[i]; j++) { /* parse the indices of each IS */ 194 row = idx[i][j]; 195 proc = rtable[row]; 196 if (proc != rank) { /* copy to the outgoing buf*/ 197 ctr[proc]++; 198 *ptr[proc] = row; 199 ptr[proc]++; 200 } 201 else { /* Update the table */ 202 if ( !BT_LOOKUP(table[i],row)) { data[i][isz[i]++] = row;} 203 } 204 } 205 /* Update the headers for the current IS */ 206 for( j = 0; j<size; j++) { /* Can Optimise this loop too */ 207 if (ctr[j]) { 208 k= ++outdat[j][0]; 209 outdat[j][2*k] = ctr[j]; 210 outdat[j][2*k-1] = i; 211 } 212 } 213 } 214 215 216 217 /* Now post the sends */ 218 send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 219 CHKPTRQ(send_waits); 220 for( i =0; i< nrqs; ++i){ 221 j = pa[i]; 222 MPI_Isend( (void *)(outdat[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i); 223 } 224 225 /* I nolonger need the original indices*/ 226 for( i=0; i< imax; ++i) { 227 ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr); 228 } 229 PetscFree(idx); 230 PetscFree(n); 231 PetscFree(rtable); 232 for( i=0; i< imax; ++i) { 233 ierr = ISDestroy(is[i]); CHKERRQ(ierr); 234 } 235 236 /* Do Local work*/ 237 ierr = FindOverlapLocal(C, imax, table,isz, data); CHKERRQ(ierr); 238 /* Extract the matrices */ 239 240 /* Receive messages*/ 241 { 242 int index; 243 244 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 245 CHKPTRQ(recv_status); 246 for ( i=0; i< nrqr; ++i) { 247 MPI_Waitany(nrqr, recv_waits, &index, recv_status+i); 248 } 249 250 send_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 251 CHKPTRQ(send_status); 252 MPI_Waitall(nrqs,send_waits,send_status); 253 } 254 /* Pahse 1 sends are complete - deallocate buffers */ 255 PetscFree(outdat); 256 PetscFree(w1); 257 PetscFree(tmp); 258 259 /* int FindOverlapRecievedMesg(Mat C, int imax, int *isz, char **table, int **data)*/ 260 xdata = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(xdata); 261 isz1 = (int *)PetscMalloc((nrqr+1) *sizeof(int)); CHKPTRQ(isz1); 262 ierr = FindOverlapRecievedMesg(C, nrqr, rbuf,xdata,isz1); CHKERRQ(ierr); 263 264 /* Nolonger need rbuf. */ 265 PetscFree(rbuf[0]); 266 PetscFree(rbuf); 267 268 269 /* Send the data back*/ 270 /* Do a global reduction to know the buffer space req for incoming messages*/ 271 { 272 int *rw1, *rw2; 273 274 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 275 PetscMemzero((void*)rw1,2*size*sizeof(int)); 276 rw2 = rw1+size; 277 for (i =0; i < nrqr ; ++i) { 278 proc = recv_status[i].MPI_SOURCE; 279 rw1[proc] = isz1[i]; 280 } 281 282 MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm); 283 bsz1 = rw2[rank]; 284 PetscFree(rw1); 285 } 286 287 /* Allocate buffers*/ 288 289 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 290 rbuf2 = (int**) PetscMalloc((nrqs+1) *sizeof(int*)); CHKPTRQ(rbuf2); 291 rbuf2[0] = (int *) PetscMalloc((nrqs*bsz1+1) * sizeof(int)); CHKPTRQ(rbuf2[0]); 292 for (i=1; i<nrqs ; ++i) rbuf2[i] = rbuf2[i-1] + bsz1; 293 294 /* Now post the receives */ 295 recv_waits2 = (MPI_Request *)PetscMalloc((nrqs+1)*sizeof(MPI_Request)); CHKPTRQ(recv_waits2) 296 CHKPTRQ(recv_waits2); 297 for ( i=0; i<nrqs; ++i){ 298 MPI_Irecv((void *)(rbuf2[i]), bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i); 299 } 300 301 /* Now post the sends */ 302 send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 303 CHKPTRQ(send_waits2); 304 for( i =0; i< nrqr; ++i){ 305 j = recv_status[i].MPI_SOURCE; 306 MPI_Isend( (void *)(xdata[i]), isz1[i], MPI_INT, j, tag, comm, send_waits2+i); 307 } 308 309 /* recieve work done on other processors*/ 310 { 311 int index, is_no, ct1, max; 312 MPI_Status *send_status2 ,*recv_status2; 313 314 recv_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 315 CHKPTRQ(recv_status2); 316 317 318 for ( i=0; i< nrqs; ++i) { 319 MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i); 320 /* Process the message*/ 321 ct1 = 2*rbuf2[index][0]+1; 322 for (j=1; j<=rbuf2[index][0]; j++) { 323 max = rbuf2[index][2*j]; 324 is_no = rbuf2[index][2*j-1]; 325 for (k=0; k < max ; k++, ct1++) { 326 row = rbuf2[index][ct1]; 327 if(!BT_LOOKUP(table[is_no],row)) { data[is_no][isz[is_no]++] = row;} 328 } 329 } 330 } 331 332 333 send_status2 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 334 CHKPTRQ(send_status2); 335 MPI_Waitall(nrqr,send_waits2,send_status2); 336 337 PetscFree(send_status2); PetscFree(recv_status2); 338 } 339 340 TrSpace( &space, &fr, &maxs ); 341 /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs);*/ 342 343 PetscFree(ctr); 344 PetscFree(pa); 345 PetscFree(rbuf2[0]); 346 PetscFree(rbuf2); 347 PetscFree(send_waits); 348 PetscFree(recv_waits); 349 PetscFree(send_waits2); 350 PetscFree(recv_waits2); 351 PetscFree(table[0]); 352 PetscFree(table); 353 PetscFree(send_status); 354 PetscFree(recv_status); 355 PetscFree(isz1); 356 PetscFree(xdata[0]); 357 PetscFree(xdata); 358 359 for ( i=0; i<imax; ++i) { 360 ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr); 361 } 362 PetscFree(isz); 363 PetscFree(data[0]); 364 PetscFree(data); 365 366 return 0; 367 } 368 369 /* FindOverlapLocal() - Called by MatincreaseOverlap, to do the work on 370 the local processor. 371 372 Inputs: 373 C - MAT_MPIAIJ; 374 imax - total no of index sets processed at a time; 375 table - an array of char - size = m bits. 376 377 Output: 378 isz - array containing the count of the solution elements correspondign 379 to each index set; 380 data - pointer to the solutions 381 */ 382 static int FindOverlapLocal(Mat C, int imax, char **table, int *isz,int **data) 383 { 384 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 385 Mat A = c->A, B = c->B; 386 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 387 int start, end, val, max, rstart,cstart, ashift, bshift,*ai, *aj; 388 int *bi, *bj, *garray, i, j, k, row; 389 390 rstart = c->rstart; 391 cstart = c->cstart; 392 ashift = a->indexshift; 393 ai = a->i; 394 aj = a->j +ashift; 395 bshift = b->indexshift; 396 bi = b->i; 397 bj = b->j +bshift; 398 garray = c->garray; 399 400 401 for( i=0; i<imax; i++) { 402 for ( j=0, max =isz[i] ; j< max; j++) { 403 row = data[i][j] - rstart; 404 start = ai[row]; 405 end = ai[row+1]; 406 for ( k=start; k < end; k++) { /* Amat */ 407 val = aj[k] + ashift + cstart; 408 if(!BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;} 409 } 410 start = bi[row]; 411 end = bi[row+1]; 412 for ( k=start; k < end; k++) { /* Bmat */ 413 val = garray[bj[k]+bshift] ; 414 if(! BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;} 415 } 416 } 417 } 418 419 return 0; 420 } 421 /* FindOverlapRecievedMesg - Process the recieved messages, 422 and return the output 423 424 Input: 425 C - the matrix 426 nrqr - no of messages being processed. 427 rbuf - an array of pointers to the recieved requests 428 429 Output: 430 xdata - array of messages to be sent back 431 isz1 - size of each message 432 */ 433 static int FindOverlapRecievedMesg(Mat C, int nrqr, int ** rbuf, int ** xdata, int * isz1 ) 434 { 435 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 436 Mat A = c->A, B = c->B; 437 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 438 int rstart,cstart, ashift, bshift,*ai, *aj, *bi, *bj, *garray, i, j, k; 439 int row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end; 440 int val, max1, max2, rank, m, no_malloc =0, *tmp, new_estimate, ctr; 441 char *xtable; 442 443 rank = c->rank; 444 m = c->M; 445 rstart = c->rstart; 446 cstart = c->cstart; 447 ashift = a->indexshift; 448 ai = a->i; 449 aj = a->j +ashift; 450 bshift = b->indexshift; 451 bi = b->i; 452 bj = b->j +bshift; 453 garray = c->garray; 454 455 456 for (i =0, ct =0, total_sz =0; i< nrqr; ++i){ 457 ct+= rbuf[i][0]; 458 for ( j = 1; j <= rbuf[i][0] ; j++ ) { total_sz += rbuf[i][2*j]; } 459 } 460 461 max1 = ct*(a->nz +b->nz)/c->m; 462 mem_estimate = 3*((total_sz > max1?total_sz:max1)+1); 463 xdata[0] = (int *)PetscMalloc(mem_estimate *sizeof(int)); CHKPTRQ(xdata[0]); 464 ++no_malloc; 465 xtable = (char *)PetscMalloc((m/BITSPERBYTE+1)); CHKPTRQ(xtable); 466 PetscMemzero((void *)isz1,(nrqr+1) *sizeof(int)); 467 468 ct3 = 0; 469 for (i =0; i< nrqr; i++) { /* for easch mesg from proc i */ 470 ct1 = 2*rbuf[i][0]+1; 471 ct2 = ct1; 472 ct3+= ct1; 473 for (j = 1, max1= rbuf[i][0]; j<=max1; j++) { /* for each IS from proc i*/ 474 PetscMemzero((void *)xtable,(m/BITSPERBYTE+1)); 475 oct2 = ct2; 476 for (k =0; k < rbuf[i][2*j]; k++, ct1++) { 477 row = rbuf[i][ct1]; 478 if(!BT_LOOKUP(xtable,row)) { 479 if (!(ct3 < mem_estimate)) { 480 new_estimate = (int)1.5*mem_estimate+1; 481 tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp); 482 PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int)); 483 PetscFree(xdata[0]); 484 xdata[0] = tmp; 485 mem_estimate = new_estimate; ++no_malloc; 486 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 487 } 488 xdata[i][ct2++] = row;ct3++; 489 } 490 } 491 for ( k=oct2, max2 =ct2 ; k< max2; k++) { 492 row = xdata[i][k] - rstart; 493 start = ai[row]; 494 end = ai[row+1]; 495 for ( l=start; l < end; l++) { 496 val = aj[l] +ashift + cstart; 497 if(!BT_LOOKUP(xtable,val)) { 498 if (!(ct3 < mem_estimate)) { 499 new_estimate = (int)1.5*mem_estimate+1; 500 tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp); 501 PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int)); 502 PetscFree(xdata[0]); 503 xdata[0] = tmp; 504 mem_estimate = new_estimate; ++no_malloc; 505 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 506 } 507 xdata[i][ct2++] = val;ct3++; 508 } 509 } 510 start = bi[row]; 511 end = bi[row+1]; 512 for ( l=start; l < end; l++) { 513 val = garray[bj[l]+bshift] ; 514 if(!BT_LOOKUP(xtable,val)) { 515 if (!(ct3 < mem_estimate)) { 516 new_estimate = (int)1.5*mem_estimate+1; 517 tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp); 518 PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int)); 519 PetscFree(xdata[0]); 520 xdata[0] = tmp; 521 mem_estimate = new_estimate; ++no_malloc; 522 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 523 } 524 xdata[i][ct2++] = val;ct3++; 525 } 526 } 527 } 528 /* Update the header*/ 529 xdata[i][2*j] = ct2-oct2; /* Undo the vector isz1 and use only a var*/ 530 xdata[i][2*j-1] = rbuf[i][2*j-1]; 531 } 532 xdata[i][0] = rbuf[i][0]; 533 xdata[i+1] = xdata[i] +ct2; 534 isz1[i] = ct2; /* size of each message */ 535 } 536 PetscFree(xtable); 537 PLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc); 538 return 0; 539 } 540 541 542 int MatGetSubMatrices_MPIAIJ (Mat C,int ismax, IS *isrow,IS *iscol,MatGetSubMatrixCall 543 scall, Mat **submat) 544 { 545 Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data; 546 Mat A = c->A, B = c->B; 547 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data, *mat; 548 int **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable, start, end, size; 549 int **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2, ierr, **rbuf1, row, proc; 550 int nrqs, msz, **ptr, index, *req_size, *ctr, *pa, tag, *tmp,tcol,bsz, nrqr; 551 int **rbuf3,*req_source,**sbuf_aj, rstart,cstart, ashift, bshift,*ai, *aj; 552 int *bi, *bj,*garray,**rbuf2, max1,max2, **rmap, **cmap, **srmap, **scmap; 553 int **lens, is_no, ncols, *cols, mat_i, *mat_j; 554 MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2, *recv_waits3 ; 555 MPI_Request *recv_waits4,*send_waits3,*send_waits4; 556 MPI_Status *recv_status ,*recv_status2,*send_status,*send_status3 ,*send_status2; 557 MPI_Status *recv_status3,*recv_status4,*send_status4; 558 MPI_Comm comm; 559 Scalar **rbuf4, *aa, *ba, **sbuf_aa, *vals, *mat_a; 560 double space, fr, maxs; 561 562 563 comm = C->comm; 564 tag = C->tag; 565 size = c->size; 566 rank = c->rank; 567 m = c->M; 568 rstart = c->rstart; 569 cstart = c->cstart; 570 ashift = a->indexshift; 571 ai = a->i; 572 aj = a->j; 573 aa = a->a; 574 bshift = b->indexshift; 575 bi = b->i; 576 bj = b->j; 577 ba = b->a; 578 garray = c->garray; 579 580 TrSpace( &space, &fr, &maxs ); 581 /* MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */ 582 583 irow = (int **)PetscMalloc((ismax+1)*sizeof(int *)); CHKPTRQ(irow); 584 icol = (int **)PetscMalloc((ismax+1)*sizeof(int *)); CHKPTRQ(icol); 585 nrow = (int *) PetscMalloc((ismax+1)*sizeof(int )); CHKPTRQ(nrow); 586 ncol = (int *) PetscMalloc((ismax+1)*sizeof(int )); CHKPTRQ(ncol); 587 rtable = (int *) PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable); 588 /* Hash table for maping row ->proc */ 589 590 for ( i=0 ; i<ismax ; i++) { /* Extract the indicies and sort them */ 591 ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr); 592 ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr); 593 ierr = ISGetLocalSize(isrow[i],&nrow[i]); CHKERRQ(ierr); 594 ierr = ISGetLocalSize(iscol[i],&ncol[i]); CHKERRQ(ierr); 595 ierr = SYIsort(nrow[i], irow[i]); CHKERRQ(ierr); 596 ierr = SYIsort(ncol[i], icol[i]); CHKERRQ(ierr); 597 } 598 599 /* Create hash table for the mapping :row -> proc*/ 600 for( i=0, j=0; i< size; i++) { 601 for (; j <c->rowners[i+1]; j++) { 602 rtable[j] = i; 603 } 604 } 605 606 /* evaluate communication - mesg to who, length of mesg, and buffer space 607 required. Based on this, buffers are allocated, and data copied into them*/ 608 w1 = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /* mesg size */ 609 w2 = w1 + size; /* if w2[i] marked, then a message to proc i*/ 610 w3 = w2 + size; /* no of IS that needs to be sent to proc i */ 611 w4 = w3 + size; /* temp work space used in determining w1, w2, w3 */ 612 PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector*/ 613 for ( i=0; i<ismax ; i++) { 614 PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector*/ 615 for ( j =0 ; j < nrow[i] ; j++) { 616 row = irow[i][j]; 617 proc = rtable[row]; 618 w4[proc]++; 619 } 620 for( j = 0; j < size; j++){ 621 if( w4[j] ) { w1[j] += w4[j]; w3[j] += 1;} 622 } 623 } 624 625 nrqs = 0; /* no of outgoing messages */ 626 msz = 0; /* total mesg length (for all proc */ 627 w1[rank] = 0; /* no mesg sent to intself */ 628 w3[rank] = 0; 629 for (i =0; i < size ; i++) { 630 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 631 } 632 pa = (int *)PetscMalloc((nrqs +1)*sizeof(int)); CHKPTRQ(pa); /* (proc -array) */ 633 for (i =0, j=0; i < size ; i++) { 634 if (w1[i]) { pa[j] = i; j++; } 635 } 636 637 /* Each message would have a header = 1 + 2*(no of IS) + data */ 638 for (i = 0; i<nrqs ; i++) { 639 j = pa[i]; 640 w1[j] += w2[j] + 2* w3[j]; 641 msz += w1[j]; 642 } 643 644 645 /* Do a global reduction to determine how many messages to expect*/ 646 { 647 int *rw1, *rw2; 648 rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1); 649 rw2 = rw1+size; 650 MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm); 651 bsz = rw1[rank]; 652 MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm); 653 nrqr = rw2[rank]; 654 PetscFree(rw1); 655 } 656 657 /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */ 658 rbuf1 = (int**) PetscMalloc((nrqr+1) *sizeof(int*)); CHKPTRQ(rbuf1); 659 rbuf1[0] = (int *) PetscMalloc((nrqr *bsz+1) * sizeof(int)); CHKPTRQ(rbuf1[0]); 660 for (i=1; i<nrqr ; ++i) rbuf1[i] = rbuf1[i-1] + bsz; 661 662 /* Now post the receives */ 663 recv_waits = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 664 CHKPTRQ(recv_waits); 665 for ( i=0; i<nrqr; ++i){ 666 MPI_Irecv((void *)(rbuf1[i]), bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i); 667 } 668 669 /* Allocate Memory for outgoing messages */ 670 sbuf1 = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(sbuf1); 671 PetscMemzero(sbuf1, 2*size*sizeof(int*)); 672 /* allocate memory for outgoing data + buf to recive the first reply */ 673 tmp = (int *)PetscMalloc((2*msz+1) *sizeof (int)); CHKPTRQ(tmp); /*mrsg arr */ 674 ptr = sbuf1 +size; /* Pointers to the data in outgoing buffers */ 675 ctr = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr); 676 677 { 678 int *iptr = tmp; 679 int ict = 0; 680 for (i = 0; i < nrqs ; i++) { 681 j = pa[i]; 682 iptr += ict; 683 sbuf1[j] = iptr; 684 ict = w1[j]; 685 } 686 } 687 688 /* Form the outgoing messages */ 689 /* Initialise the header space */ 690 for ( i=0 ; i<nrqs ; i++) { 691 j = pa[i]; 692 sbuf1[j][0] = 0; 693 PetscMemzero(sbuf1[j]+1, 2 * w3[j]*sizeof(int)); 694 ptr[j] = sbuf1[j] + 2*w3[j] +1; 695 } 696 697 698 /* Parse the isrow and copy data into outbuf */ 699 for ( i=0 ; i<ismax ; i++) { 700 PetscMemzero(ctr,size*sizeof(int)); 701 for( j=0; j<nrow[i]; j++) { /* parse the indices of each IS */ 702 row = irow[i][j]; 703 proc = rtable[row]; 704 if (proc != rank) { /* copy to the outgoing buf*/ 705 ctr[proc]++; 706 *ptr[proc] = row; 707 ptr[proc]++; 708 } 709 } 710 /* Update the headers for the current IS */ 711 for( j = 0; j<size; j++) { /* Can Optimise this loop too */ 712 if (ctr[j]) { 713 k= ++sbuf1[j][0]; 714 sbuf1[j][2*k] = ctr[j]; 715 sbuf1[j][2*k-1] = i; 716 } 717 } 718 } 719 720 /* Now post the sends */ 721 send_waits = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 722 CHKPTRQ(send_waits); 723 for( i =0; i< nrqs; ++i){ 724 j = pa[i]; 725 /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */ 726 MPI_Isend( (void *)(sbuf1[j]), w1[j], MPI_INT, j, tag, comm, send_waits+i); 727 } 728 729 /* Post Recieves to capture the buffer size */ 730 recv_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 731 CHKPTRQ(recv_waits2); 732 rbuf2 = (int**)PetscMalloc((nrqs+1) *sizeof(int *)); CHKPTRQ(rbuf2); 733 rbuf2[0] = tmp + msz; 734 for( i =1; i< nrqs; ++i){ 735 j = pa[i]; 736 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 737 } 738 for( i =0; i< nrqs; ++i){ 739 j = pa[i]; 740 MPI_Irecv( (void *)(rbuf2[i]), w1[j], MPI_INT, j, tag+1, comm, recv_waits2+i); 741 } 742 743 /* Send to other procs the buf size they should allocate */ 744 745 746 /* Receive messages*/ 747 send_waits2 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 748 CHKPTRQ(send_waits2); 749 recv_status = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 750 CHKPTRQ(recv_status); 751 req_size = (int *) PetscMalloc( (nrqr +1) * sizeof(int)) ; CHKPTRQ(req_size); 752 req_source = (int *) PetscMalloc( (nrqr +1) * sizeof(int)) ; CHKPTRQ(req_source); 753 sbuf2 = (int**) PetscMalloc( (nrqr +1) * sizeof(int*)) ; CHKPTRQ(sbuf2); 754 755 for ( i=0; i< nrqr; ++i) { 756 MPI_Waitany(nrqr, recv_waits, &index, recv_status+i); 757 /* req_size[index] = 2*rbuf1[index][0];*/ 758 req_size[index] = 0; 759 start = 2*rbuf1[index][0] + 1 ; 760 MPI_Get_count(recv_status+i,MPI_INT, &end); 761 sbuf2 [index] = (int *)PetscMalloc(end*sizeof(int)); CHKPTRQ(sbuf2[index]); 762 763 for (j=start; j< end; j++) { 764 row = rbuf1[index][j] - rstart; 765 sbuf2[index][j] = (ai[row+1] - ai[row]); /*overite it with nz count of that row */ 766 sbuf2[index][j] += (bi[row+1] - bi[row]); 767 req_size[index] += sbuf2[index][j]; 768 } 769 req_source[index] = recv_status[i].MPI_SOURCE; 770 /* form the header */ 771 sbuf2[index][0] = req_size[index]; 772 for (j=1; j<start; j++){ sbuf2[index][j] = rbuf1[index][j]; } 773 /* printf("[%d] Send to %d: size %d, index = %d start = %d end = %d \n", rank, 774 req_source[index] , *(req_size+index), index, start, end); */ 775 MPI_Isend((void *)(sbuf2[index]),end,MPI_INT,req_source[index],tag+1, comm, send_waits2+i); 776 777 } 778 779 780 /* recv buffer sizes */ 781 /* Receive messages*/ 782 783 rbuf3 = (int**)PetscMalloc((nrqs+1) *sizeof(int*)); CHKPTRQ(rbuf3); 784 rbuf4 = (Scalar**)PetscMalloc((nrqs+1) *sizeof(Scalar*)); CHKPTRQ(rbuf4); 785 recv_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 786 CHKPTRQ(recv_waits3); 787 recv_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request)); 788 CHKPTRQ(recv_waits4); 789 recv_status2 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 790 CHKPTRQ(recv_status2); 791 for ( i=0; i< nrqs; ++i) { 792 MPI_Waitany(nrqs, recv_waits2, &index, recv_status2+i); 793 794 rbuf3[index] = (int *)PetscMalloc(rbuf2[index][0]*sizeof(int)); 795 CHKPTRQ(rbuf3[index]); 796 rbuf4[index] = (Scalar *)PetscMalloc(rbuf2[index][0]*sizeof(Scalar)); 797 CHKPTRQ(rbuf4[index]); 798 /* printf("[%d] Posting Irecv for aj, aa from %d, size of buf = %d \n",rank, 799 recv_status2[i].MPI_SOURCE,rbuf2[index][0]); */ 800 MPI_Irecv((void *)(rbuf3[index]),rbuf2[index][0], MPI_INT, 801 recv_status2[i].MPI_SOURCE, tag+2, comm, recv_waits3+index); 802 MPI_Irecv((void *)(rbuf4[index]),rbuf2[index][0], MPIU_SCALAR, 803 recv_status2[i].MPI_SOURCE, tag+3, comm, recv_waits4+index); 804 805 } 806 807 /* Wait on sends1 and sends2 */ 808 send_status = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 809 CHKPTRQ(send_status); 810 send_status2 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 811 CHKPTRQ(send_status2); 812 813 MPI_Waitall(nrqs,send_waits,send_status); 814 MPI_Waitall(nrqr,send_waits2,send_status2); 815 816 817 /* Now allocate buffers for a->j, and send them off */ 818 sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *)); CHKPTRQ(sbuf_aj); 819 for ( i=0, j =0; i< nrqr; i++) j += req_size[i]; 820 sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); 821 for (i =1; i< nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 822 823 send_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 824 CHKPTRQ(send_waits3); 825 for (i=0; i<nrqr; i++) { 826 ct1 = 2*rbuf1[i][0]+1; 827 ct2 = 0; 828 for (j=1, max1 = rbuf1[i][0]; j<= max1; j++){ 829 for( k=0; k< rbuf1[i][2*j]; k++, ct1++) { 830 /* row = rbuf1[i][ct1] - rstart; 831 start = ai[row]; 832 end = ai[row+1]; 833 for (l = start; l< end; l++) { 834 sbuf_aj[i][ct2++] = aj[l] + cstart; 835 } 836 start = bi[row]; 837 end = bi[row+1]; 838 for (l = start; l< end; l++) { 839 sbuf_aj[i][ct2++] = garray[bj[l]]; 840 }*/ 841 row = rbuf1[i][ct1]; 842 ierr = MatGetRow(C, row, &ncols, &cols, 0); CHKERRQ(ierr); 843 /* MPIU_printf(MPI_COMM_SELF,"[%d]",rank); 844 for (l =0; l<ncols ; l++ ) MPIU_printf(MPI_COMM_SELF,"%d ",cols[l]); 845 MPIU_printf (MPI_COMM_SELF,"\n"); */ 846 PetscMemcpy(sbuf_aj[i]+ct2, cols, ncols*sizeof(int)); 847 ct2 += ncols; 848 ierr = MatRestoreRow(C,row, &ncols, &cols,0); CHKERRQ(ierr); 849 } 850 } 851 /* no header for this message */ 852 /* printf("[%d] Send AJ to %d: size %d, ct2 = %d \n", rank, req_source[i] , req_size[i], ct2); */ 853 MPI_Isend((void *)(sbuf_aj[i]),req_size[i],MPI_INT,req_source[i],tag+2, comm, send_waits3+i); 854 } 855 recv_status3 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 856 CHKPTRQ(recv_status3); 857 send_status3 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 858 CHKPTRQ(send_status3); 859 /* 860 for (i=0; i< nrqs; ++i){ 861 MPI_Waitany(nrqs, recv_waits3, &index, recv_status3+i); 862 } 863 864 for (i=0; i< nrqr; ++i){ 865 MPI_Waitany(nrqr, send_waits3, &index, send_status3+i); 866 } 867 */ 868 MPI_Waitall(nrqs,recv_waits3,recv_status3); 869 MPI_Waitall(nrqr,send_waits3,send_status3); 870 871 872 /* Now allocate buffers for a->a, and send them off */ 873 sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa); 874 for ( i=0, j =0; i< nrqr; i++) j += req_size[i]; 875 sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]); 876 for (i =1; i< nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 877 878 send_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request)); 879 CHKPTRQ(send_waits4); 880 for (i=0; i<nrqr; i++) { 881 ct1 = 2*rbuf1[i][0]+1; 882 ct2 = 0; 883 for (j=1, max1 = rbuf1[i][0]; j<= max1; j++){ 884 for( k=0; k< rbuf1[i][2*j]; k++, ct1++) { 885 /* row = rbuf1[i][ct1] - rstart; 886 start = ai[row]; 887 end = ai[row+1]; 888 for (l = start; l< end; l++) { 889 sbuf_aa[i][ct2++] = aa[l]; 890 } 891 start = bi[row]; 892 end = bi[row+1]; 893 for (l = start; l< end; l++) { 894 sbuf_aa[i][ct2++] = ba[l]; 895 } */ 896 row = rbuf1[i][ct1]; 897 ierr = MatGetRow(C, row, &ncols, 0, &vals); CHKERRQ(ierr); 898 PetscMemcpy(sbuf_aa[i]+ct2, vals, ncols*sizeof(Scalar)); 899 ct2 += ncols; 900 ierr = MatRestoreRow(C,row, &ncols,0,&vals); CHKERRQ(ierr); 901 } 902 } 903 /* no header for this message */ 904 /* printf("[%d] Send AA to %d: size %d, ct2 = %d \n", rank, req_source[i] , req_size[i], ct2); */ 905 MPI_Isend((void *)(sbuf_aa[i]),req_size[i],MPIU_SCALAR,req_source[i],tag+3, comm, send_waits4+i); 906 } 907 recv_status4 = (MPI_Status *) PetscMalloc( (nrqs+1)*sizeof(MPI_Status) ); 908 CHKPTRQ(recv_status4); 909 send_status4 = (MPI_Status *) PetscMalloc( (nrqr+1)*sizeof(MPI_Status) ); 910 CHKPTRQ(send_status4); 911 912 /* for (i=0; i< nrqs; ++i){ 913 MPI_Waitany(nrqs, recv_waits4, &index, recv_status4+i); 914 } 915 916 for (i=0; i< nrqr; ++i){ 917 MPI_Waitany(nrqr, send_waits4, &index, send_status4+i); 918 }*/ 919 920 MPI_Waitall(nrqs,recv_waits4,recv_status4); 921 MPI_Waitall(nrqr,send_waits4,send_status4); 922 923 924 /* Now u have all the data required, so form the matrix */ 925 /* rbuf3->aj, rbuf4 -> aa */ 926 927 /* create col map */ 928 cmap = (int **) PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(cmap); 929 scmap = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(scmap); 930 cmap[0] = (int *)PetscMalloc((1+ ismax*c->N)*sizeof(int)); CHKPTRQ(cmap[0]); 931 PetscMemzero((char *)cmap[0],(1+ ismax*c->N)*sizeof(int)); 932 for (i =1; i<ismax; i++) { cmap[i] = cmap[i-1] + c->n; } 933 for (i =0; i<ismax; i++) { scmap[i] = cmap[i] +ashift;} 934 for (i=0; i< ismax; i++) { 935 for ( j=0; j< ncol[i]; j++) { 936 scmap[i][icol[i][j]] = j+1; 937 } 938 } 939 940 /* Create lens which is required for MatCreate... */ 941 lens = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(lens); 942 for (i =0, j=0; i<ismax; i++) { j +=nrow[i]; } 943 lens[0] = (int *)PetscMalloc((1+ j)*sizeof(int)); CHKPTRQ(lens[0]); 944 PetscMemzero((char *)lens[0], (1+ j)*sizeof(int)); 945 for (i =1; i<ismax; i++) { lens[i] = lens[i-1] +nrow[i-1]; } 946 947 /* Update lens from local data */ 948 for (i=0; i< ismax; i++) { 949 for (j =0; j< nrow[i]; j++) { 950 row = irow[i][j] ; 951 proc = rtable[row]; 952 if (proc == rank) { 953 /* row -= rstart; 954 start = ai[row]; 955 end = ai[row +1]; 956 for (k =start; k < end; k++) { 957 if ( scmap[aj[k]+cstart]) { lens[i][j]++ ;} 958 } 959 start = bi[row]; 960 end = bi[row +1]; 961 for (k =start; k < end; k++) { 962 if ( scmap[garray[bj[k]]]) { lens[i][j]++ ;} 963 } */ 964 ierr = MatGetRow(C,row,&ncols,&cols,0); CHKERRQ(ierr); 965 for (k =0; k< ncols; k++) { 966 if ( scmap[i][cols[k]]) { lens[i][j]++ ;} 967 } 968 ierr = MatRestoreRow(C,row,&ncols,&cols,0); CHKERRQ(ierr); 969 } 970 } 971 } 972 973 /* Create row map*/ 974 rmap = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(rmap); 975 srmap = (int **)PetscMalloc((1+ ismax)*sizeof(int *)); CHKPTRQ(srmap); 976 rmap[0] = (int *)PetscMalloc((1+ ismax*c->M)*sizeof(int)); CHKPTRQ(rmap[0]); 977 PetscMemzero((char *)rmap[0],(1+ ismax*c->M)*sizeof(int)); 978 for (i =1; i<ismax; i++) { rmap[i] = rmap[i-1] + c->M ;} 979 for (i =0; i<ismax; i++) { srmap[i] = rmap[i] +ashift;} 980 for (i=0; i< ismax; i++) { 981 for ( j=0; j< nrow[i]; j++) { 982 srmap[i][irow[i][j]] = j; 983 } 984 } 985 986 /* Update lens from offproc data */ 987 for ( i =0; i<nrqs; i++) { 988 index = pa[i]; 989 ct1 = 2*sbuf1[index][0]+1; /* sbuf1, rbuf2*/ 990 ct2 = 0; /* rbuf3, rbuf4 */ 991 for (j =1; j<= sbuf1[index][0]; j++) { 992 is_no = sbuf1[index][2*j-1]; 993 max1 = sbuf1[index][2*j]; 994 for (k =0; k< max1; k++, ct1++) { 995 row = sbuf1[index][ct1]; 996 row = srmap[is_no][row]; /* the val in the new matrix to be */ 997 max2 = rbuf2[i][ct1]; 998 for (l=0; l<max2; l++, ct2++) { 999 if (scmap[is_no][rbuf3[i][ct2]]) { 1000 lens[is_no][row]++; 1001 } 1002 } 1003 } 1004 } 1005 } 1006 1007 /* Create the submatrices */ 1008 if( scall == MAT_REUSE_MATRIX) { 1009 int n_cols, n_rows; 1010 for (i=0; i<ismax; i++){ 1011 ierr = MatGetSize((*submat)[i],&n_rows, &n_cols); CHKERRQ(ierr); 1012 if ((n_rows !=nrow[i]) || (n_cols !=ncol[i])) { 1013 SETERRQ(1,"MatGetSubmatrices_MPIAIJ:"); 1014 } 1015 } 1016 } 1017 else { 1018 *submat = (Mat *)PetscMalloc(ismax*sizeof(Mat)); CHKPTRQ(*submat); 1019 for ( i=0; i<ismax; i++) { 1020 ierr = MatCreateSeqAIJ(comm, nrow[i],ncol[i],0,lens[i],(*submat)+i); CHKERRQ(ierr); 1021 } 1022 } 1023 1024 /* Assemble the matrices */ 1025 /* First assemble the local rows */ 1026 for (i=0; i< ismax; i++) { 1027 mat = (Mat_SeqAIJ *)((*submat)[i]->data); 1028 for (j =0; j< nrow[i]; j++) { 1029 row = irow[i][j] ; 1030 proc = rtable[row]; 1031 if (proc == rank) { 1032 ierr = MatGetRow(C,row,&ncols,&cols,&vals); CHKERRQ(ierr); 1033 row = srmap[i][row]; 1034 mat_i = mat->i[row]; 1035 mat_a = mat->a + mat_i; 1036 mat_j = mat->j + mat_i; 1037 for (k =0; k< ncols; k++) { 1038 if ((tcol = scmap[i][cols[k]])) { 1039 *mat_j++ = tcol - 1; 1040 *mat_a++ = vals[k]; 1041 mat->ilen[row]++; 1042 } 1043 } 1044 ierr = MatRestoreRow(C,row,&ncols,&cols,&vals); CHKERRQ(ierr); 1045 } 1046 } 1047 } 1048 1049 /* Now assemble the off proc rows*/ 1050 for ( i =0; i<nrqs; i++) { 1051 index = pa[i]; 1052 ct1 = 2*sbuf1[index][0]+1; /* sbuf1, rbuf2*/ 1053 ct2 = 0; /* rbuf3, rbuf4 */ 1054 for (j =1; j<= sbuf1[index][0]; j++) { 1055 is_no = sbuf1[index][2*j-1]; 1056 mat = (Mat_SeqAIJ *)((*submat)[is_no]->data); 1057 max1 = sbuf1[index][2*j]; 1058 for (k =0; k< max1; k++, ct1++) { 1059 row = sbuf1[index][ct1]; 1060 row = srmap[is_no][row]; /* the val in the new matrix to be */ 1061 mat_i = mat->i[row]; 1062 mat_a = mat->a + mat_i; 1063 mat_j = mat->j + mat_i; 1064 max2 = rbuf2[i][ct1]; 1065 for (l=0; l<max2; l++, ct2++) { 1066 if ((tcol = scmap[is_no][rbuf3[i][ct2]])) { 1067 *mat_j++ = tcol - 1; 1068 *mat_a++ = rbuf4[i][ct2]; 1069 mat->ilen[row]++; 1070 } 1071 } 1072 } 1073 } 1074 } 1075 /* Packup*/ 1076 for( i=0; i< ismax; i++) { 1077 ierr = MatAssemblyBegin((*submat)[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1078 } 1079 for( i=0; i< ismax; i++) { 1080 ierr = MatAssemblyEnd((*submat)[i], FINAL_ASSEMBLY); CHKERRQ(ierr); 1081 } 1082 /* Restore the indices */ 1083 for (i=0; i<ismax; i++) { 1084 ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr); 1085 ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr); 1086 } 1087 /* Destroy allocated memory */ 1088 PetscFree(nrow); 1089 PetscFree(ncol); 1090 PetscFree(irow); 1091 PetscFree(icol); 1092 PetscFree(rtable); 1093 PetscFree(w1); 1094 PetscFree(pa); 1095 PetscFree(rbuf1[0]); 1096 PetscFree(rbuf1); 1097 PetscFree(sbuf1 ); 1098 PetscFree(tmp); 1099 PetscFree(ctr); 1100 PetscFree(rbuf2); 1101 PetscFree(req_size); 1102 PetscFree(req_source); 1103 for ( i=0; i< nrqr; ++i) { 1104 PetscFree(sbuf2[i]); 1105 } 1106 for ( i=0; i< nrqs; ++i) { 1107 PetscFree(rbuf3[i]); 1108 PetscFree(rbuf4[i]); 1109 } 1110 1111 PetscFree( sbuf2 ); 1112 PetscFree(rbuf3); 1113 PetscFree(rbuf4 ); 1114 PetscFree(sbuf_aj[0]); 1115 PetscFree(sbuf_aj); 1116 PetscFree(sbuf_aa[0]); 1117 PetscFree(sbuf_aa); 1118 1119 PetscFree(cmap[0]); 1120 PetscFree(rmap[0]); 1121 PetscFree(cmap); 1122 PetscFree(rmap); 1123 PetscFree(scmap); 1124 PetscFree(srmap); 1125 PetscFree(lens[0]); 1126 PetscFree(lens); 1127 1128 PetscFree(recv_waits ); 1129 PetscFree(recv_waits2); 1130 PetscFree(recv_waits3); 1131 PetscFree(recv_waits4); 1132 1133 PetscFree(recv_status); 1134 PetscFree(recv_status2); 1135 PetscFree(recv_status3); 1136 PetscFree(recv_status4); 1137 1138 PetscFree(send_waits); 1139 PetscFree(send_waits2); 1140 PetscFree(send_waits3); 1141 PetscFree(send_waits4); 1142 1143 PetscFree( send_status); 1144 PetscFree(send_status2); 1145 PetscFree(send_status3); 1146 PetscFree(send_status4); 1147 1148 return 0; 1149 } 1150 1151 1152 1153 1154