#ifndef lint
static char vcid[] = "$Id: mpiov.c,v 1.4 1996/01/23 20:13:36 balay Exp balay $";
#endif

#include "mpiaij.h"
int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *);

/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the parallel index
   sets is[0..is_max-1] by 'ov' levels for an MPIAIJ matrix, by applying a
   single-level overlap increase 'ov' times.

   Input parameters:
     C      - the MPIAIJ matrix
     is_max - number of index sets in is[]
     is     - the index sets (updated in place by the private routine)
     ov     - number of overlap levels to add; must be at least 1
*/
int MatIncreaseOverlap_MPIAIJ(Mat C, int is_max, IS *is, int ov)
{
  int i, ierr;
  if (ov < 1){ SETERRQ(1," MatIncreaseOverlap_MPIAIJ: overlap should be atleast 1\n");}
  /* each pass widens the overlap by exactly one level */
  for (i =1; i<=ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_private(C, is_max, is); CHKERRQ(ierr);
  }
  return 0;
}


/*
   MatIncreaseOverlap_MPIAIJ_private - One level of overlap increase.

   For every row index in each IS, the owning process (or a remote process,
   via two rounds of MPI messages) adds the column indices of that row of C
   to the set.  Outgoing message layout (all int):
       msg[0]            = number of index sets contributing to this message
       msg[1..msg[0]]    = row count for each contributing index set
       msg[msg[0]+1 ...] = the row indices themselves, grouped by index set

   NOTE(review): this routine is visibly unfinished -- the second round of
   sends/receives (send_waits2/recv_waits2) is posted but never waited on,
   the received replies in rbuf2 are never merged back into is[], several
   buffers (recv_status, recv_waits2, send_waits2, rbuf2, xdata, xtable,
   isz, isz1) are never freed, and ISRestoreIndices is never called (the
   trailing comments below admit as much).
*/
int MatIncreaseOverlap_MPIAIJ_private(Mat C, int is_max, IS *is)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
  Mat A = c->A, B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  int **idx, *n, *w1, *w2, *w3, *w4, *rtable,**table,**data;
  int size, rank, m,i,j,k, ierr, **rbuf, row, proc, mct, msz, **outdat, **ptr;
  int *ctr, sum, *pa, tag, *tmp,bsz, nmsg , *isz, *isz1, **xdata, rstart;
  int cstart, *ai, *aj, *bi, *bj, *garray, bsz1, **rbuf2;
  MPI_Comm comm;
  MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2 ;
  MPI_Status *send_status ,*recv_status;
  /* assume overlap = 1 */
  comm = C->comm;
  tag = C->tag;
  size = c->size;
  rank = c->rank;
  m = c->M;              /* global number of rows */
  rstart = c->rstart;    /* first global row owned locally */
  cstart = c->cstart;    /* first global column of the diagonal block */
  ai = a->i;             /* CSR row pointers / column indices of the diagonal block */
  aj = a->j;
  bi = b->i;             /* CSR of the off-diagonal block; bj holds *local* indices */
  bj = b->j;
  garray = c->garray;    /* maps off-diagonal local column -> global column */

  idx = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(idx);
  n = (int *)PetscMalloc((is_max+1)*sizeof(int )); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable);
  /* rtable maps each global row to its owning process (a direct lookup
     array, not really a hash table) */

  for ( i=0 ; i<is_max ; i++) {
    ierr = ISGetIndices(is[i],&idx[i]); CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]); CHKERRQ(ierr);
  }

  /* fill rtable: rows [rowners[i], rowners[i+1]) belong to process i */
  for( i=0, j=0; i< size; i++) {
    for (; j <c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  w1 = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;     /* if w2[i] marked, then a message to proc i*/
  w3 = w2 + size;     /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;     /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* zero w1,w2,w3 only; w4 is re-zeroed per IS below */
  for ( i=0; i<is_max ; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector*/
    for ( j =0 ; j < n[i] ; j++) {
      row = idx[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for( j = 0; j < size; j++){
      if( w4[j] ) { w1[j] += w4[j]; w3[j] += 1;}
    }
  }

  mct = 0;       /* no of outgoing messages */
  msz = 0;       /* total mesg length (for all procs) */
  w1[rank] = 0;  /* no message sent to self */
  w3[rank] = 0;
  for (i =0; i < size ; i++) {
    if (w1[i]) { w2[i] = 1; mct++;} /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((mct +1)*sizeof(int)); CHKPTRQ(pa); /* list of destination procs */
  for (i =0, j=0; i < size ; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + no of IS + data */
  for (i = 0; i<mct ; i++) {
    j = pa[i];
    w1[j] += w2[j] + w3[j];  /* w1[j] is now the full message length for proc j */
    msz += w1[j];
  }

  /* Allocate Memory for outgoing messages */
  outdat = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat);
  PetscMemzero(outdat, 2*size*sizeof(int*));
  tmp = (int *)PetscMalloc((msz+1) *sizeof (int)); CHKPTRQ(tmp); /* one shared arena for all messages */
  ptr = outdat +size;   /* Pointers to the data in outgoing buffers */
  ctr = (int *)PetscMalloc( size*sizeof(int)); CHKPTRQ(ctr);

  {
    /* carve tmp into per-destination message buffers */
    int *iptr = tmp;
    int ict = 0;
    for (i = 0; i < mct ; i++) {
      j = pa[i];
      iptr += ict;
      outdat[j] = iptr;
      ict = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for ( i=0 ; i<mct ; i++) {
    j = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1, w3[j]*sizeof(int));
    ptr[j] = outdat[j] + w3[j]; /* NOTE(review): the header occupies slots 0..w3[j]
                                   (count + w3[j] sizes) and the receiver below reads
                                   data starting at index rbuf[i][0]+1, so this
                                   presumably should be outdat[j] + w3[j] + 1; as
                                   written the last header slot and the first data
                                   entry share slot w3[j] -- TODO confirm and fix */
  }

  /* Memory for doing local proc's work*/
  table = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(table);
  data = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(data);
  table[0] = (int *)PetscMalloc((m+1)*(is_max)*2*sizeof(int)); CHKPTRQ(table[0]);
  data [0] = table[0] + (m+1)*(is_max);

  /* table[i] is a membership/count array indexed by global index; data[i]
     accumulates the distinct indices of IS i.  NOTE(review): entries from the
     B block below are global *column* indices, also used to index table[i] of
     size m+1 = global rows + 1 -- assumes a square matrix; TODO confirm */
  for(i = 1; i<is_max ; i++) {
    table[i] = table[0] + (m+1)*i;
    data[i] = data[0] + (m+1)*i;
  }

  PetscMemzero((void*)*table,(m+1)*(is_max)*sizeof(int));
  isz = (int *)PetscMalloc((is_max+1) *sizeof(int)); CHKPTRQ(isz);
  PetscMemzero((void *)isz,(is_max+1) *sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  for ( i=0 ; i<is_max ; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for( j=0; j<n[i]; j++) {  /* parse the indices of each IS */
      row = idx[i][j];
      proc = rtable[row];
      if (proc != rank) { /* remotely owned row: copy to the outgoing buf */
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
      else { /* locally owned row: record it once in the local table */
        if(!table[i][row]++) { data[i][isz[i]++] = row;}
      }
    }
    /* Update the headers: bump the IS count and append this IS's row count */
    for( j = 0; j<size; j++) { /* Can Optimise this loop too */
      if (ctr[j]) {
        outdat[j][0] ++;
        outdat[j][ outdat[j][0]] = ctr[j];
      }
    }
  }

  /* sanity check: header[0] must equal the precomputed IS count per proc */
  for ( i=0 ; i<mct ; i++) {
    j = pa[i];
    if (w3[j] != outdat[j][0]) {SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[1] mismatch!\n"); }
  }

  /* sanity check: header sizes must sum to the precomputed message length */
  for ( i=0 ; i<mct ; i++) {
    j = pa[i];
    sum = 1;
    for (k = 1; k <= w3[j]; k++) sum += outdat[j][k]+1;
    if (sum != w1[j]) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[2-n] mismatch! \n"); }
  }

  MPIU_printf(MPI_COMM_SELF,"[%d]Whew!!! sending to %d nodes\n",rank, mct);


  /* Do a global reduction to determine how many messages to expect*/
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1+size;
    /* MAX over column 'rank' of everyone's w1 = largest message headed here */
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz = rw1[rank];
    /* SUM over column 'rank' of everyone's w2 = number of messages headed here */
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nmsg = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers . Prob none if nmsg = 0 ???? */
  rbuf = (int**) PetscMalloc((nmsg+1) *sizeof(int*)); CHKPTRQ(rbuf);
  rbuf[0] = (int *) PetscMalloc((nmsg *bsz+1) * sizeof(int)); CHKPTRQ(rbuf[0]);
  for (i=1; i<nmsg ; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nmsg; ++i){
    MPI_Irecv((void *)rbuf[i], bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Now post the sends */
  send_waits = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for( i =0; i< mct; ++i){
    j = pa[i];
    MPI_Isend( (void *)outdat[j], w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  MPIU_printf(MPI_COMM_SELF,"[%d]:) posted %d sends and %d recieves\n",rank, mct, nmsg);


  /* Do Local work: for each locally-owned row already in data[i], add all of
     its column indices (A block shifted by cstart; B block mapped through
     garray) to the set.  Note: data[i] grows while being scanned, but only
     the first 'max' original entries are expanded (overlap = 1). */
  {
    int start, end, val, max;

    for( i=0; i<is_max; i++) {
      for ( j=0, max =isz[i] ; j< max; j++) {
        row = data[i][j] - rstart;
        start = ai[row];
        end = ai[row+1];
        for ( k=start; k < end; k++) { /* Amat: local cols -> global via cstart */
          val = aj[k] +cstart;
          if(!table[i][val]++) { data[i][isz[i]++] = val;}
        }
        start = bi[row];
        end = bi[row+1];
        for ( k=start; k < end; k++) { /* Bmat: local cols -> global via garray */
          val = garray[bj[k]] ;
          if(!table[i][val]++) { data[i][isz[i]++] = val;}
        }
      }
    }
  }
  /* Receive messages*/
  {
    int index;

    recv_status = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status);
    /* complete receives in arrival order; recv_status[i].MPI_SOURCE is used
       later to address the reply */
    for ( i=0; i< nmsg; ++i) {
      MPI_Waitany(nmsg, recv_waits, &index, recv_status+i);
    }

    send_status = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(mct,send_waits,send_status);
  }


  /* Do work on received messages: for each row requested by a remote proc,
     compute its one-level overlap locally and build a reply in xdata[i]
     (same header-then-data layout as the outgoing messages). */
  {
    int ct, ct1, ct2, *xtable;
    int oct2, l, start, end, val, max1, max2;


    for (i =0, ct =0; i< nmsg; ++i) ct+= rbuf[i][0]; /* total IS received */

    xdata = (int **)PetscMalloc((nmsg+1)*sizeof(int *)); CHKPTRQ(xdata);
    xtable = (int *)PetscMalloc((ct+nmsg+(m+1)*(ct+1))*sizeof(int)); CHKPTRQ(xtable);
    xdata [0] = xtable + (m+1); /* first m+1 ints are the scratch table, rest is reply space */
    isz1 = (int *)PetscMalloc((nmsg+1) *sizeof(int)); CHKPTRQ(isz1);
    PetscMemzero((void *)isz1,(nmsg+1) *sizeof(int));


    for (i =0; i< nmsg; i++) { /* for each received message */
      ct1 = rbuf[i][0]+1;  /* read cursor: data starts after the header */
      ct2 = rbuf[i][0]+1;  /* write cursor into the reply xdata[i] */
      for (j = 0, max1= rbuf[i][0]; j<max1; j++) { /* for each IS in this message */
        PetscMemzero((void *)xtable,(m+1)*sizeof(int));
        oct2 = ct2;
        for (k =0; k < rbuf[i][j]+1; k++, ct1++) { /* NOTE(review): the row count of
                                                      IS j is stored at rbuf[i][j+1];
                                                      the bound rbuf[i][j]+1 looks like
                                                      a misplaced bracket -- confirm */
          row = rbuf[i][ct1];
          if(!xtable[row]++) { xdata[i][ct2++] = row;}
        }
        for ( k=oct2, max2 =ct2 ; k< max2; k++) { /* expand each distinct row by one level */
          row = xdata[i][k] - rstart;
          start = ai[row];
          end = ai[row+1];
          for ( l=start; l < end; l++) {
            val = aj[l] +cstart;
            if(!xtable[val]++) { xdata[i][ct2++] = val;}
          }
          start = bi[row];
          end = bi[row+1];
          for ( l=start; l < end; l++) {
            val = garray[bj[l]] ;
            if(!xtable[val]++) { xdata[i][ct2++] = val;}
          }
        }
        /* Update the header*/
        xdata[i][j] = ct2-oct2; /* NOTE(review): per-IS sizes belong in slots 1..count,
                                   so this presumably should be xdata[i][j+1]; as
                                   written, j==0 lands in slot 0 and is overwritten
                                   just below -- confirm */
      }
      xdata[i][0] = rbuf[i][0];
      xdata[i+1] = xdata[i] +ct2; /* replies are packed back-to-back */
      isz1[i] = ct2; /* size of each message */
    }
  }
  /* need isz, xdata;*/

  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero((void*)rw1,2*size*sizeof(int));
    rw2 = rw1+size;
    for (i =0; i < nmsg ; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    /* MAX over column 'rank' = largest reply headed back to this process */
    MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1 = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers*/

  /* Allocate memory for recv buffers . Prob none if nmsg = 0 ???? */
  rbuf2 = (int**) PetscMalloc((mct+1) *sizeof(int*)); CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) PetscMalloc((mct*bsz1+1) * sizeof(int)); CHKPTRQ(rbuf2[0]);
  for (i=1; i<mct ; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Now post the receives */
  recv_waits2 = (MPI_Request *)PetscMalloc((mct+1)*sizeof(MPI_Request)); CHKPTRQ(recv_waits2) /* NOTE(review): missing ';' here and the check is duplicated on the next line -- clean up */
  CHKPTRQ(recv_waits2);
  for ( i=0; i<mct; ++i){
    MPI_Irecv((void *)rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i);
  }

  /* Now post the sends */
  send_waits2 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  for( i =0; i< nmsg; ++i){
    j = recv_status[i].MPI_SOURCE;  /* reply goes back to whoever asked */
    MPI_Isend( (void *)xdata[i], isz1[i], MPI_INT, j, tag, comm, send_waits2+i);
  }

  MPIU_printf(MPI_COMM_SELF,"[%d]:):) posted %d sends and %d recieves\n",rank, mct, nmsg);

  /* recieve work done on other processors*/

  /* pack up*/

  /* done !!!! */

  /* whew! already done? check again :) */
  /* NOTE(review): the function returns with send_waits2/recv_waits2 still in
     flight (never waited on), rbuf2 contents unused, and the second-round
     buffers plus isz/isz1/xdata/xtable/recv_status/recv_waits leaked */
  PetscFree(idx);
  PetscFree(n);
  PetscFree(rtable);
  PetscFree(w1);
  PetscFree(tmp);
  PetscFree(outdat);
  PetscFree(ctr);
  PetscFree(pa);
  PetscFree(rbuf[0]);
  PetscFree(rbuf);
  PetscFree(send_waits);
  PetscFree(recv_waits);
  PetscFree(table[0]);
  PetscFree(table);
  PetscFree(data);
  PetscFree(send_status);
  /* Dont forget to ISRestoreIndices */
  return 0;
}