1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *); 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*); 12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 15 #undef __FUNCT__ 16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 18 { 19 PetscErrorCode ierr; 20 PetscInt i; 21 22 PetscFunctionBegin; 23 if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 24 for (i=0; i<ov; ++i) { 25 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 26 } 27 PetscFunctionReturn(0); 28 } 29 30 /* 31 Sample message format: 32 If a processor A wants processor B to process some elements corresponding 33 to index sets is[1],is[5] 34 mesg [0] = 2 (no of index sets in the mesg) 35 ----------- 36 mesg [1] = 1 => is[1] 37 mesg [2] = sizeof(is[1]); 38 ----------- 39 mesg [3] = 5 => is[5] 40 mesg [4] = sizeof(is[5]); 41 ----------- 42 mesg [5] 43 mesg [n] datas[1] 44 ----------- 45 mesg[n+1] 46 mesg[m] data(is[5]) 47 ----------- 48 49 Notes: 50 nrqs - no of requests sent (or to be sent out) 51 nrqr - no of requests recieved (which have to be or which have been processed 52 */ 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 56 { 57 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 58 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 59 const PetscInt **idx,*idx_i; 60 PetscInt *n,**data,len; 61 PetscErrorCode ierr; 62 PetscMPIInt size,rank,tag1,tag2; 63 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 64 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 65 PetscBT *table; 66 MPI_Comm comm; 67 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 68 MPI_Status *s_status,*recv_status; 69 char *t_p; 70 71 PetscFunctionBegin; 72 comm = ((PetscObject)C)->comm; 73 size = c->size; 74 rank = c->rank; 75 M = C->rmap->N; 76 77 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 78 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 79 80 ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr); 81 82 for (i=0; i<imax; i++) { 83 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 84 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 85 } 86 87 /* evaluate communication - mesg to who,length of mesg, and buffer space 88 required. Based on this, buffers are allocated, and data copied into them*/ 89 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); 90 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 91 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 92 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 93 for (i=0; i<imax; i++) { 94 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 95 idx_i = idx[i]; 96 len = n[i]; 97 for (j=0; j<len; j++) { 98 row = idx_i[j]; 99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 100 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 101 w4[proc]++; 102 } 103 for (j=0; j<size; j++){ 104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 105 } 106 } 107 108 nrqs = 0; /* no of outgoing messages */ 109 msz = 0; /* total mesg length (for all proc */ 110 w1[rank] = 0; /* no mesg sent to intself */ 111 w3[rank] = 0; 112 for (i=0; i<size; i++) { 113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 114 } 115 /* pa - is list of processors to communicate with */ 116 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); 117 for (i=0,j=0; i<size; i++) { 118 if (w1[i]) {pa[j] = i; j++;} 119 } 120 121 /* Each message would have a header = 1 + 2*(no of IS) + data */ 122 for (i=0; i<nrqs; i++) { 123 j = pa[i]; 124 w1[j] += w2[j] + 2*w3[j]; 125 msz += w1[j]; 126 } 127 128 /* Determine the number of messages to expect, their lengths, from from-ids */ 129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 131 132 /* Now post the Irecvs corresponding to these messages */ 133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 134 135 /* Allocate Memory for outgoing messages */ 136 ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 137 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 138 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 139 140 { 141 PetscInt *iptr = tmp,ict = 0; 142 for (i=0; i<nrqs; i++) { 143 j = pa[i]; 144 iptr += ict; 145 outdat[j] = iptr; 146 ict = w1[j]; 147 } 148 } 149 150 /* Form the outgoing messages */ 151 /*plug in the headers*/ 152 for (i=0; i<nrqs; i++) { 153 j = pa[i]; 154 outdat[j][0] = 0; 155 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 156 ptr[j] = outdat[j] + 2*w3[j] + 1; 157 } 158 159 /* Memory for doing local proc's work*/ 160 { 161 ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz, 162 M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr); 163 ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr); 164 ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr); 165 ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr); 166 ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr); 167 ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr); 168 169 for (i=0; i<imax; i++) { 170 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 171 data[i] = d_p + M*i; 172 } 173 } 174 175 /* Parse the IS and update local tables and the outgoing buf with the data*/ 176 { 177 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 178 PetscBT table_i; 179 180 for (i=0; i<imax; i++) { 181 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 182 n_i = n[i]; 183 table_i = table[i]; 184 idx_i = idx[i]; 185 data_i = data[i]; 186 isz_i = isz[i]; 187 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 188 row = idx_i[j]; 189 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 190 if (proc != rank) { /* copy to the outgoing buffer */ 191 ctr[proc]++; 192 *ptr[proc] = row; 193 ptr[proc]++; 194 } else { /* Update the local table */ 195 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 196 } 197 } 198 /* Update the headers for the current IS */ 199 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 200 if ((ctr_j = ctr[j])) { 201 outdat_j = outdat[j]; 202 k = ++outdat_j[0]; 203 outdat_j[2*k] = ctr_j; 204 outdat_j[2*k-1] = i; 205 } 206 } 207 isz[i] = isz_i; 208 } 209 } 210 211 /* Now post the sends */ 212 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 213 for (i=0; i<nrqs; ++i) { 214 j = pa[i]; 215 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 216 } 217 218 /* No longer need the original indices*/ 219 for (i=0; i<imax; ++i) { 220 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 221 } 222 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 223 224 for (i=0; i<imax; ++i) { 225 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 226 } 227 228 /* Do Local work*/ 229 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 230 231 /* Receive messages*/ 232 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr); 233 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 234 235 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 236 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 237 238 /* Phase 1 sends are complete - deallocate buffers */ 239 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 240 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 241 242 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr); 243 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr); 244 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 245 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 246 ierr = PetscFree(rbuf);CHKERRQ(ierr); 247 248 249 /* Send the data back*/ 250 /* Do a global reduction to know the buffer space req for incoming messages*/ 251 { 252 PetscMPIInt *rw1; 253 254 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr); 255 ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 256 257 for (i=0; i<nrqr; ++i) { 258 proc = recv_status[i].MPI_SOURCE; 259 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 260 rw1[proc] = isz1[i]; 261 } 262 ierr = PetscFree(onodes1);CHKERRQ(ierr); 263 ierr = PetscFree(olengths1);CHKERRQ(ierr); 264 265 /* Determine the number of messages to expect, their lengths, from from-ids */ 266 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 267 ierr = PetscFree(rw1);CHKERRQ(ierr); 268 } 269 /* Now post the Irecvs corresponding to these messages */ 270 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 271 272 /* Now post the sends */ 273 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 274 for (i=0; i<nrqr; ++i) { 275 j = recv_status[i].MPI_SOURCE; 276 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 277 } 278 279 /* receive work done on other processors*/ 280 { 281 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 282 PetscMPIInt idex; 283 PetscBT table_i; 284 MPI_Status *status2; 285 286 ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr); 287 for (i=0; i<nrqs; ++i) { 288 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 289 /* Process the message*/ 290 rbuf2_i = rbuf2[idex]; 291 ct1 = 2*rbuf2_i[0]+1; 292 jmax = rbuf2[idex][0]; 293 for (j=1; j<=jmax; j++) { 294 max = rbuf2_i[2*j]; 295 is_no = rbuf2_i[2*j-1]; 296 isz_i = isz[is_no]; 297 data_i = data[is_no]; 298 table_i = table[is_no]; 299 for (k=0; k<max; k++,ct1++) { 300 row = rbuf2_i[ct1]; 301 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 302 } 303 isz[is_no] = isz_i; 304 } 305 } 306 307 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 308 ierr = PetscFree(status2);CHKERRQ(ierr); 309 } 310 311 for (i=0; i<imax; ++i) { 312 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 313 } 314 315 ierr = PetscFree(onodes2);CHKERRQ(ierr); 316 ierr = PetscFree(olengths2);CHKERRQ(ierr); 317 318 ierr = PetscFree(pa);CHKERRQ(ierr); 319 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 320 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 321 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 322 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 323 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 324 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 325 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 326 ierr = PetscFree(s_status);CHKERRQ(ierr); 327 ierr = PetscFree(recv_status);CHKERRQ(ierr); 328 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 329 ierr = PetscFree(xdata);CHKERRQ(ierr); 330 ierr = PetscFree(isz1);CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 #undef __FUNCT__ 335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 336 /* 337 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 338 the work on the local processor. 339 340 Inputs: 341 C - MAT_MPIAIJ; 342 imax - total no of index sets processed at a time; 343 table - an array of char - size = m bits. 344 345 Output: 346 isz - array containing the count of the solution elements corresponding 347 to each index set; 348 data - pointer to the solutions 349 */ 350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 351 { 352 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 353 Mat A = c->A,B = c->B; 354 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 355 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 356 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 357 PetscBT table_i; 358 359 PetscFunctionBegin; 360 rstart = C->rmap->rstart; 361 cstart = C->cmap->rstart; 362 ai = a->i; 363 aj = a->j; 364 bi = b->i; 365 bj = b->j; 366 garray = c->garray; 367 368 369 for (i=0; i<imax; i++) { 370 data_i = data[i]; 371 table_i = table[i]; 372 isz_i = isz[i]; 373 for (j=0,max=isz[i]; j<max; j++) { 374 row = data_i[j] - rstart; 375 start = ai[row]; 376 end = ai[row+1]; 377 for (k=start; k<end; k++) { /* Amat */ 378 val = aj[k] + cstart; 379 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 380 } 381 start = bi[row]; 382 end = bi[row+1]; 383 for (k=start; k<end; k++) { /* Bmat */ 384 val = garray[bj[k]]; 385 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 386 } 387 } 388 isz[i] = isz_i; 389 } 390 PetscFunctionReturn(0); 391 } 392 393 #undef __FUNCT__ 394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 395 /* 396 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 397 and return the output 398 399 Input: 400 C - the matrix 401 nrqr - no of messages being processed. 402 rbuf - an array of pointers to the recieved requests 403 404 Output: 405 xdata - array of messages to be sent back 406 isz1 - size of each message 407 408 For better efficiency perhaps we should malloc separately each xdata[i], 409 then if a remalloc is required we need only copy the data for that one row 410 rather then all previous rows as it is now where a single large chunck of 411 memory is used. 412 413 */ 414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 415 { 416 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 417 Mat A = c->A,B = c->B; 418 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 419 PetscErrorCode ierr; 420 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 421 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 422 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 423 PetscInt *rbuf_i,kmax,rbuf_0; 424 PetscBT xtable; 425 426 PetscFunctionBegin; 427 m = C->rmap->N; 428 rstart = C->rmap->rstart; 429 cstart = C->cmap->rstart; 430 ai = a->i; 431 aj = a->j; 432 bi = b->i; 433 bj = b->j; 434 garray = c->garray; 435 436 437 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 438 rbuf_i = rbuf[i]; 439 rbuf_0 = rbuf_i[0]; 440 ct += rbuf_0; 441 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 442 } 443 444 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 445 else max1 = 1; 446 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 447 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 448 ++no_malloc; 449 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 450 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 451 452 ct3 = 0; 453 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 454 rbuf_i = rbuf[i]; 455 rbuf_0 = rbuf_i[0]; 456 ct1 = 2*rbuf_0+1; 457 ct2 = ct1; 458 ct3 += ct1; 459 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 460 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 461 oct2 = ct2; 462 kmax = rbuf_i[2*j]; 463 for (k=0; k<kmax; k++,ct1++) { 464 row = rbuf_i[ct1]; 465 if (!PetscBTLookupSet(xtable,row)) { 466 if (!(ct3 < mem_estimate)) { 467 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 468 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 469 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 470 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 471 xdata[0] = tmp; 472 mem_estimate = new_estimate; ++no_malloc; 473 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 474 } 475 xdata[i][ct2++] = row; 476 ct3++; 477 } 478 } 479 for (k=oct2,max2=ct2; k<max2; k++) { 480 row = xdata[i][k] - rstart; 481 start = ai[row]; 482 end = ai[row+1]; 483 for (l=start; l<end; l++) { 484 val = aj[l] + cstart; 485 if (!PetscBTLookupSet(xtable,val)) { 486 if (!(ct3 < mem_estimate)) { 487 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 488 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 489 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 490 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 491 xdata[0] = tmp; 492 mem_estimate = new_estimate; ++no_malloc; 493 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 494 } 495 xdata[i][ct2++] = val; 496 ct3++; 497 } 498 } 499 start = bi[row]; 500 end = bi[row+1]; 501 for (l=start; l<end; l++) { 502 val = garray[bj[l]]; 503 if (!PetscBTLookupSet(xtable,val)) { 504 if (!(ct3 < mem_estimate)) { 505 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 506 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 507 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 508 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 509 xdata[0] = tmp; 510 mem_estimate = new_estimate; ++no_malloc; 511 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 512 } 513 xdata[i][ct2++] = val; 514 ct3++; 515 } 516 } 517 } 518 /* Update the header*/ 519 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 520 xdata[i][2*j-1] = rbuf_i[2*j-1]; 521 } 522 xdata[i][0] = rbuf_0; 523 xdata[i+1] = xdata[i] + ct2; 524 isz1[i] = ct2; /* size of each message */ 525 } 526 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 527 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 528 PetscFunctionReturn(0); 529 } 530 /* -------------------------------------------------------------------------*/ 531 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 532 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 533 /* 534 Every processor gets the entire matrix 535 */ 536 #undef __FUNCT__ 537 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 538 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 539 { 540 Mat B; 541 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 542 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 543 PetscErrorCode ierr; 544 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 545 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 546 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 547 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 548 549 PetscFunctionBegin; 550 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 551 ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 552 553 if (scall == MAT_INITIAL_MATRIX) { 554 /* ---------------------------------------------------------------- 555 Tell every processor the number of nonzeros per row 556 */ 557 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 558 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 559 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 560 } 561 sendcount = A->rmap->rend - A->rmap->rstart; 562 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 563 for (i=0; i<size; i++) { 564 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 565 displs[i] = A->rmap->range[i]; 566 } 567 #if defined(PETSC_HAVE_MPI_IN_PLACE) 568 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 569 #else 570 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 571 #endif 572 /* --------------------------------------------------------------- 573 Create the sequential matrix of the same type as the local block diagonal 574 */ 575 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 576 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 577 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 578 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 579 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 580 **Bin = B; 581 b = (Mat_SeqAIJ *)B->data; 582 583 /*-------------------------------------------------------------------- 584 Copy my part of matrix column indices over 585 */ 586 sendcount = ad->nz + bd->nz; 587 jsendbuf = b->j + b->i[rstarts[rank]]; 588 a_jsendbuf = ad->j; 589 b_jsendbuf = bd->j; 590 n = A->rmap->rend - A->rmap->rstart; 591 cnt = 0; 592 for (i=0; i<n; i++) { 593 594 /* put in lower diagonal portion */ 595 m = bd->i[i+1] - bd->i[i]; 596 while (m > 0) { 597 /* is it above diagonal (in bd (compressed) numbering) */ 598 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 599 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 600 m--; 601 } 602 603 /* put in diagonal portion */ 604 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 605 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 606 } 607 608 /* put in upper diagonal portion */ 609 while (m-- > 0) { 610 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 611 } 612 } 613 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 614 615 /*-------------------------------------------------------------------- 616 Gather all column indices to all processors 617 */ 618 for (i=0; i<size; i++) { 619 recvcounts[i] = 0; 620 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 621 recvcounts[i] += lens[j]; 622 } 623 } 624 displs[0] = 0; 625 for (i=1; i<size; i++) { 626 displs[i] = displs[i-1] + recvcounts[i-1]; 627 } 628 #if defined(PETSC_HAVE_MPI_IN_PLACE) 629 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 630 #else 631 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 632 #endif 633 /*-------------------------------------------------------------------- 634 Assemble the matrix into useable form (note numerical values not yet set) 635 */ 636 /* set the b->ilen (length of each row) values */ 637 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 638 /* set the b->i indices */ 639 b->i[0] = 0; 640 for (i=1; i<=A->rmap->N; i++) { 641 b->i[i] = b->i[i-1] + lens[i-1]; 642 } 643 ierr = PetscFree(lens);CHKERRQ(ierr); 644 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 645 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 646 647 } else { 648 B = **Bin; 649 b = (Mat_SeqAIJ *)B->data; 650 } 651 652 /*-------------------------------------------------------------------- 653 Copy my part of matrix numerical values into the values location 654 */ 655 if (flag == MAT_GET_VALUES){ 656 sendcount = ad->nz + bd->nz; 657 sendbuf = b->a + b->i[rstarts[rank]]; 658 a_sendbuf = ad->a; 659 b_sendbuf = bd->a; 660 b_sendj = bd->j; 661 n = A->rmap->rend - A->rmap->rstart; 662 cnt = 0; 663 for (i=0; i<n; i++) { 664 665 /* put in lower diagonal portion */ 666 m = bd->i[i+1] - bd->i[i]; 667 while (m > 0) { 668 /* is it above diagonal (in bd (compressed) numbering) */ 669 if (garray[*b_sendj] > A->rmap->rstart + i) break; 670 sendbuf[cnt++] = *b_sendbuf++; 671 m--; 672 b_sendj++; 673 } 674 675 /* put in diagonal portion */ 676 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 677 sendbuf[cnt++] = *a_sendbuf++; 678 } 679 680 /* put in upper diagonal portion */ 681 while (m-- > 0) { 682 sendbuf[cnt++] = *b_sendbuf++; 683 b_sendj++; 684 } 685 } 686 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 687 688 /* ----------------------------------------------------------------- 689 Gather all numerical values to all processors 690 */ 691 if (!recvcounts) { 692 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 693 } 694 for (i=0; i<size; i++) { 695 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 696 } 697 displs[0] = 0; 698 for (i=1; i<size; i++) { 699 displs[i] = displs[i-1] + recvcounts[i-1]; 700 } 701 recvbuf = b->a; 702 #if defined(PETSC_HAVE_MPI_IN_PLACE) 703 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 704 #else 705 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 706 #endif 707 } /* endof (flag == MAT_GET_VALUES) */ 708 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 709 710 if (A->symmetric){ 711 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 712 } else if (A->hermitian) { 713 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 714 } else if (A->structurally_symmetric) { 715 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 716 } 717 PetscFunctionReturn(0); 718 } 719 720 721 722 #undef __FUNCT__ 723 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 724 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 725 { 726 PetscErrorCode ierr; 727 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 728 PetscBool rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix; 729 730 PetscFunctionBegin; 731 /* 732 Check for special case each processor gets entire matrix 733 */ 734 if (ismax == 1 && C->rmap->N == C->cmap->N) { 735 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 736 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 737 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 738 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 739 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 740 wantallmatrix = PETSC_TRUE; 741 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 742 } 743 } 744 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 745 if (twantallmatrix) { 746 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 747 PetscFunctionReturn(0); 748 } 749 750 /* Allocate memory to hold all the submatrices */ 751 if (scall != MAT_REUSE_MATRIX) { 752 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 753 } 754 /* Determine the number of stages through which submatrices are done */ 755 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 756 /* 757 Each stage will extract nmax submatrices. 758 nmax is determined by the matrix column dimension. 759 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 760 */ 761 if (!nmax) nmax = 1; 762 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 763 764 /* Make sure every processor loops through the nstages */ 765 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 766 767 for (i=0,pos=0; i<nstages; i++) { 768 if (pos+nmax <= ismax) max_no = nmax; 769 else if (pos == ismax) max_no = 0; 770 else max_no = ismax-pos; 771 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 772 pos += max_no; 773 } 774 PetscFunctionReturn(0); 775 } 776 777 /* -------------------------------------------------------------------------*/ 778 #undef __FUNCT__ 779 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 780 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 781 { 782 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 783 Mat A = c->A; 784 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 785 const PetscInt **icol,**irow; 786 PetscInt *nrow,*ncol,start; 787 PetscErrorCode ierr; 788 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 789 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 790 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 791 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 792 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 793 #if defined (PETSC_USE_CTABLE) 794 PetscTable *cmap,cmap_i,*rmap,rmap_i; 795 #else 796 PetscInt **cmap,*cmap_i,**rmap,*rmap_i; 797 #endif 798 const PetscInt *irow_i; 799 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 800 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 801 MPI_Request *r_waits4,*s_waits3,*s_waits4; 802 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 803 MPI_Status *r_status3,*r_status4,*s_status4; 804 MPI_Comm comm; 805 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 806 PetscMPIInt *onodes1,*olengths1; 807 PetscMPIInt idex,idex2,end; 808 809 PetscFunctionBegin; 810 comm = ((PetscObject)C)->comm; 811 tag0 = ((PetscObject)C)->tag; 812 size = c->size; 813 rank = c->rank; 814 815 /* Get some new tags to keep the communication clean */ 816 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 817 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 818 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 819 820 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 821 822 for (i=0; i<ismax; i++) { 823 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 824 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 825 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 826 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 827 } 828 829 /* evaluate communication - mesg to who, length of mesg, and buffer space 830 required. Based on this, buffers are allocated, and data copied into them*/ 831 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 832 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 833 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 834 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 835 for (i=0; i<ismax; i++) { 836 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 837 jmax = nrow[i]; 838 irow_i = irow[i]; 839 for (j=0; j<jmax; j++) { 840 l = 0; 841 row = irow_i[j]; 842 while (row >= C->rmap->range[l+1]) l++; 843 proc = l; 844 w4[proc]++; 845 } 846 for (j=0; j<size; j++) { 847 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 848 } 849 } 850 851 nrqs = 0; /* no of outgoing messages */ 852 msz = 0; /* total mesg length (for all procs) */ 853 w1[rank] = 0; /* no mesg sent to self */ 854 w3[rank] = 0; 855 for (i=0; i<size; i++) { 856 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 857 } 858 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 859 for (i=0,j=0; i<size; i++) { 860 if (w1[i]) { pa[j] = i; j++; } 861 } 862 863 /* Each message would have a header = 1 + 2*(no of IS) + data */ 864 for (i=0; i<nrqs; i++) { 865 j = pa[i]; 866 w1[j] += w2[j] + 2* w3[j]; 867 msz += w1[j]; 868 } 869 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 870 871 /* Determine the number of messages to expect, their lengths, from from-ids */ 872 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 873 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 874 875 /* Now post the Irecvs corresponding to these messages */ 876 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 877 878 ierr = PetscFree(onodes1);CHKERRQ(ierr); 879 ierr = PetscFree(olengths1);CHKERRQ(ierr); 880 881 /* Allocate Memory for outgoing messages */ 882 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 883 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 884 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 885 886 { 887 PetscInt *iptr = tmp,ict = 0; 888 for (i=0; i<nrqs; i++) { 889 j = pa[i]; 890 iptr += ict; 891 sbuf1[j] = iptr; 892 ict = w1[j]; 893 } 894 } 895 896 /* Form the outgoing messages */ 897 /* Initialize the header space */ 898 for (i=0; i<nrqs; i++) { 899 j = pa[i]; 900 sbuf1[j][0] = 0; 901 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 902 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 903 } 904 905 /* Parse the isrow and copy data into outbuf */ 906 for (i=0; i<ismax; i++) { 907 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 908 irow_i = irow[i]; 909 jmax = nrow[i]; 910 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 911 l = 0; 912 row = irow_i[j]; 913 while (row >= C->rmap->range[l+1]) l++; 914 proc = l; 915 if (proc != rank) { /* copy to the outgoing buf*/ 916 ctr[proc]++; 917 *ptr[proc] = row; 918 ptr[proc]++; 919 } 920 } 921 /* Update the headers for the current IS */ 922 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 923 if ((ctr_j = ctr[j])) { 924 sbuf1_j = sbuf1[j]; 925 k = ++sbuf1_j[0]; 926 sbuf1_j[2*k] = ctr_j; 927 sbuf1_j[2*k-1] = i; 928 } 929 } 930 } 931 932 /* Now post the sends */ 933 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 934 for (i=0; i<nrqs; ++i) { 935 j = pa[i]; 936 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 937 } 938 939 /* Post Receives to capture the buffer size */ 940 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 941 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 942 rbuf2[0] = tmp + msz; 943 for (i=1; i<nrqs; ++i) { 944 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 945 } 946 for (i=0; i<nrqs; ++i) { 947 j = pa[i]; 948 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 949 } 950 951 /* Send to other procs the buf size they should allocate */ 952 953 954 /* Receive messages*/ 955 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 956 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 957 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 958 { 959 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 960 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 961 PetscInt *sbuf2_i; 962 963 for (i=0; i<nrqr; ++i) { 964 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 965 req_size[idex] = 0; 966 rbuf1_i = rbuf1[idex]; 967 start = 2*rbuf1_i[0] + 1; 968 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 969 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 970 sbuf2_i = sbuf2[idex]; 971 for (j=start; j<end; j++) { 972 id = rbuf1_i[j] - rstart; 973 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 974 sbuf2_i[j] = ncols; 975 req_size[idex] += ncols; 976 } 977 req_source[idex] = r_status1[i].MPI_SOURCE; 978 /* form the header */ 979 sbuf2_i[0] = req_size[idex]; 980 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 981 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 982 } 983 } 984 ierr = PetscFree(r_status1);CHKERRQ(ierr); 985 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 986 987 /* recv buffer sizes */ 988 /* Receive messages*/ 989 990 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 991 ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 992 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 993 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 994 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 995 996 for (i=0; i<nrqs; ++i) { 997 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 998 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 999 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1000 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1001 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1002 } 1003 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1004 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1005 1006 /* Wait on sends1 and sends2 */ 1007 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1008 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1009 1010 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1011 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1012 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1013 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1014 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1015 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1016 1017 /* Now allocate buffers for a->j, and send them off */ 1018 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1019 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1020 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1021 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1022 1023 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1024 { 1025 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1026 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1027 PetscInt cend = C->cmap->rend; 1028 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1029 1030 for (i=0; i<nrqr; i++) { 1031 rbuf1_i = rbuf1[i]; 1032 sbuf_aj_i = sbuf_aj[i]; 1033 ct1 = 2*rbuf1_i[0] + 1; 1034 ct2 = 0; 1035 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1036 kmax = rbuf1[i][2*j]; 1037 for (k=0; k<kmax; k++,ct1++) { 1038 row = rbuf1_i[ct1] - rstart; 1039 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1040 ncols = nzA + nzB; 1041 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1042 1043 /* load the column indices for this row into cols*/ 1044 cols = sbuf_aj_i + ct2; 1045 1046 lwrite = 0; 1047 for (l=0; l<nzB; l++) { 1048 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1049 } 1050 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1051 for (l=0; l<nzB; l++) { 1052 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1053 } 1054 1055 ct2 += ncols; 1056 } 1057 } 1058 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1059 } 1060 } 1061 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1062 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1063 1064 /* Allocate buffers for a->a, and send them off */ 1065 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1066 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1067 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1068 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1069 1070 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1071 { 1072 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1073 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1074 PetscInt cend = C->cmap->rend; 1075 PetscInt *b_j = b->j; 1076 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1077 1078 for (i=0; i<nrqr; i++) { 1079 rbuf1_i = rbuf1[i]; 1080 sbuf_aa_i = sbuf_aa[i]; 1081 ct1 = 2*rbuf1_i[0]+1; 1082 ct2 = 0; 1083 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1084 kmax = rbuf1_i[2*j]; 1085 for (k=0; k<kmax; k++,ct1++) { 1086 row = rbuf1_i[ct1] - rstart; 1087 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1088 ncols = nzA + nzB; 1089 cworkB = b_j + b_i[row]; 1090 vworkA = a_a + a_i[row]; 1091 vworkB = b_a + b_i[row]; 1092 1093 /* load the column values for this row into vals*/ 1094 vals = sbuf_aa_i+ct2; 1095 1096 lwrite = 0; 1097 for (l=0; l<nzB; l++) { 1098 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1099 } 1100 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1101 for (l=0; l<nzB; l++) { 1102 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1103 } 1104 1105 ct2 += ncols; 1106 } 1107 } 1108 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1109 } 1110 } 1111 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1112 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1113 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1114 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1115 1116 /* Form the matrix */ 1117 /* create col map: global col of C -> local col of submatrices */ 1118 { 1119 const PetscInt *icol_i; 1120 #if defined (PETSC_USE_CTABLE) 1121 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1122 for (i=0; i<ismax; i++) { 1123 ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr); 1124 jmax = ncol[i]; 1125 icol_i = icol[i]; 1126 cmap_i = cmap[i]; 1127 for (j=0; j<jmax; j++) { 1128 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr); 1129 } 1130 } 1131 #else 1132 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1133 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1134 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1135 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1136 for (i=0; i<ismax; i++) { 1137 jmax = ncol[i]; 1138 icol_i = icol[i]; 1139 cmap_i = cmap[i]; 1140 for (j=0; j<jmax; j++) { 1141 cmap_i[icol_i[j]] = j+1; 1142 } 1143 } 1144 #endif 1145 } 1146 1147 /* Create lens which is required for MatCreate... */ 1148 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1149 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1150 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1151 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1152 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1153 1154 /* Update lens from local data */ 1155 for (i=0; i<ismax; i++) { 1156 jmax = nrow[i]; 1157 cmap_i = cmap[i]; 1158 irow_i = irow[i]; 1159 lens_i = lens[i]; 1160 for (j=0; j<jmax; j++) { 1161 l = 0; 1162 row = irow_i[j]; 1163 while (row >= C->rmap->range[l+1]) l++; 1164 proc = l; 1165 if (proc == rank) { 1166 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1167 for (k=0; k<ncols; k++) { 1168 #if defined (PETSC_USE_CTABLE) 1169 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1170 #else 1171 tcol = cmap_i[cols[k]]; 1172 #endif 1173 if (tcol) { lens_i[j]++;} 1174 } 1175 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1176 } 1177 } 1178 } 1179 1180 /* Create row map: global row of C -> local row of submatrices */ 1181 #if defined (PETSC_USE_CTABLE) 1182 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1183 for (i=0; i<ismax; i++) { 1184 ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr); 1185 rmap_i = rmap[i]; 1186 irow_i = irow[i]; 1187 jmax = nrow[i]; 1188 for (j=0; j<jmax; j++) { 1189 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr); 1190 } 1191 } 1192 #else 1193 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1194 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1195 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1196 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1197 for (i=0; i<ismax; i++) { 1198 rmap_i = rmap[i]; 1199 irow_i = irow[i]; 1200 jmax = nrow[i]; 1201 for (j=0; j<jmax; j++) { 1202 rmap_i[irow_i[j]] = j; 1203 } 1204 } 1205 #endif 1206 1207 /* Update lens from offproc data */ 1208 { 1209 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1210 1211 for (tmp2=0; tmp2<nrqs; tmp2++) { 1212 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1213 idex = pa[idex2]; 1214 sbuf1_i = sbuf1[idex]; 1215 jmax = sbuf1_i[0]; 1216 ct1 = 2*jmax+1; 1217 ct2 = 0; 1218 rbuf2_i = rbuf2[idex2]; 1219 rbuf3_i = rbuf3[idex2]; 1220 for (j=1; j<=jmax; j++) { 1221 is_no = sbuf1_i[2*j-1]; 1222 max1 = sbuf1_i[2*j]; 1223 lens_i = lens[is_no]; 1224 cmap_i = cmap[is_no]; 1225 rmap_i = rmap[is_no]; 1226 for (k=0; k<max1; k++,ct1++) { 1227 #if defined (PETSC_USE_CTABLE) 1228 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1229 row--; 1230 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1231 #else 1232 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1233 #endif 1234 max2 = rbuf2_i[ct1]; 1235 for (l=0; l<max2; l++,ct2++) { 1236 #if defined (PETSC_USE_CTABLE) 1237 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1238 #else 1239 tcol = cmap_i[rbuf3_i[ct2]]; 1240 #endif 1241 if (tcol) { 1242 lens_i[row]++; 1243 } 1244 } 1245 } 1246 } 1247 } 1248 } 1249 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1250 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1251 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1252 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1253 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1254 1255 /* Create the submatrices */ 1256 if (scall == MAT_REUSE_MATRIX) { 1257 PetscBool flag; 1258 1259 /* 1260 Assumes new rows are same length as the old rows,hence bug! 1261 */ 1262 for (i=0; i<ismax; i++) { 1263 mat = (Mat_SeqAIJ *)(submats[i]->data); 1264 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1265 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1266 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1267 /* Initial matrix as if empty */ 1268 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1269 submats[i]->factortype = C->factortype; 1270 } 1271 } else { 1272 for (i=0; i<ismax; i++) { 1273 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1274 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1275 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1276 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1277 } 1278 } 1279 1280 /* Assemble the matrices */ 1281 /* First assemble the local rows */ 1282 { 1283 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1284 PetscScalar *imat_a; 1285 1286 for (i=0; i<ismax; i++) { 1287 mat = (Mat_SeqAIJ*)submats[i]->data; 1288 imat_ilen = mat->ilen; 1289 imat_j = mat->j; 1290 imat_i = mat->i; 1291 imat_a = mat->a; 1292 cmap_i = cmap[i]; 1293 rmap_i = rmap[i]; 1294 irow_i = irow[i]; 1295 jmax = nrow[i]; 1296 for (j=0; j<jmax; j++) { 1297 l = 0; 1298 row = irow_i[j]; 1299 while (row >= C->rmap->range[l+1]) l++; 1300 proc = l; 1301 if (proc == rank) { 1302 old_row = row; 1303 #if defined (PETSC_USE_CTABLE) 1304 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1305 row--; 1306 #else 1307 row = rmap_i[row]; 1308 #endif 1309 ilen_row = imat_ilen[row]; 1310 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1311 mat_i = imat_i[row] ; 1312 mat_a = imat_a + mat_i; 1313 mat_j = imat_j + mat_i; 1314 for (k=0; k<ncols; k++) { 1315 #if defined (PETSC_USE_CTABLE) 1316 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1317 #else 1318 tcol = cmap_i[cols[k]]; 1319 #endif 1320 if (tcol){ 1321 *mat_j++ = tcol - 1; 1322 *mat_a++ = vals[k]; 1323 ilen_row++; 1324 } 1325 } 1326 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1327 imat_ilen[row] = ilen_row; 1328 } 1329 } 1330 } 1331 } 1332 1333 /* Now assemble the off proc rows*/ 1334 { 1335 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1336 PetscInt *imat_j,*imat_i; 1337 PetscScalar *imat_a,*rbuf4_i; 1338 1339 for (tmp2=0; tmp2<nrqs; tmp2++) { 1340 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1341 idex = pa[idex2]; 1342 sbuf1_i = sbuf1[idex]; 1343 jmax = sbuf1_i[0]; 1344 ct1 = 2*jmax + 1; 1345 ct2 = 0; 1346 rbuf2_i = rbuf2[idex2]; 1347 rbuf3_i = rbuf3[idex2]; 1348 rbuf4_i = rbuf4[idex2]; 1349 for (j=1; j<=jmax; j++) { 1350 is_no = sbuf1_i[2*j-1]; 1351 rmap_i = rmap[is_no]; 1352 cmap_i = cmap[is_no]; 1353 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1354 imat_ilen = mat->ilen; 1355 imat_j = mat->j; 1356 imat_i = mat->i; 1357 imat_a = mat->a; 1358 max1 = sbuf1_i[2*j]; 1359 for (k=0; k<max1; k++,ct1++) { 1360 row = sbuf1_i[ct1]; 1361 #if defined (PETSC_USE_CTABLE) 1362 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1363 row--; 1364 #else 1365 row = rmap_i[row]; 1366 #endif 1367 ilen = imat_ilen[row]; 1368 mat_i = imat_i[row] ; 1369 mat_a = imat_a + mat_i; 1370 mat_j = imat_j + mat_i; 1371 max2 = rbuf2_i[ct1]; 1372 for (l=0; l<max2; l++,ct2++) { 1373 1374 #if defined (PETSC_USE_CTABLE) 1375 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1376 #else 1377 tcol = cmap_i[rbuf3_i[ct2]]; 1378 #endif 1379 if (tcol) { 1380 *mat_j++ = tcol - 1; 1381 *mat_a++ = rbuf4_i[ct2]; 1382 ilen++; 1383 } 1384 } 1385 imat_ilen[row] = ilen; 1386 } 1387 } 1388 } 1389 } 1390 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1391 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1392 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1393 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1394 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1395 1396 /* Restore the indices */ 1397 for (i=0; i<ismax; i++) { 1398 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1399 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1400 } 1401 1402 /* Destroy allocated memory */ 1403 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1404 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1405 ierr = PetscFree(pa);CHKERRQ(ierr); 1406 1407 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1408 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1409 for (i=0; i<nrqr; ++i) { 1410 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1411 } 1412 for (i=0; i<nrqs; ++i) { 1413 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1414 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1415 } 1416 1417 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1418 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1419 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1420 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1421 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1422 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1423 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1424 #if defined (PETSC_USE_CTABLE) 1425 for (i=0; i<ismax; i++) { 1426 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1427 ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr); 1428 } 1429 #else 1430 ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 1431 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1432 #endif 1433 ierr = PetscFree(cmap);CHKERRQ(ierr); 1434 ierr = PetscFree(rmap);CHKERRQ(ierr); 1435 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1436 ierr = PetscFree(lens);CHKERRQ(ierr); 1437 1438 for (i=0; i<ismax; i++) { 1439 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1440 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1441 } 1442 PetscFunctionReturn(0); 1443 } 1444 1445 /* 1446 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1447 Be careful not to destroy them elsewhere. 1448 */ 1449 #undef __FUNCT__ 1450 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1451 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1452 { 1453 /* If making this function public, change the error returned in this function away from _PLIB. */ 1454 PetscErrorCode ierr; 1455 Mat_MPIAIJ *aij; 1456 PetscBool seqaij; 1457 1458 PetscFunctionBegin; 1459 /* Check to make sure the component matrices are compatible with C. */ 1460 ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1461 if(!seqaij) { 1462 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1463 } 1464 ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1465 if(!seqaij) { 1466 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1467 } 1468 if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) { 1469 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1470 } 1471 ierr = MatCreate(comm, C); CHKERRQ(ierr); 1472 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr); 1473 ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr); 1474 ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr); 1475 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1476 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1477 if((*C)->cmap->N != A->cmap->n + B->cmap->n) { 1478 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1479 } 1480 ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr); 1481 aij = (Mat_MPIAIJ*)((*C)->data); 1482 aij->A = A; 1483 aij->B = B; 1484 ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr); 1485 ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr); 1486 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1487 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1488 PetscFunctionReturn(0); 1489 } 1490 1491 #undef __FUNCT__ 1492 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1493 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1494 { 1495 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1496 PetscFunctionBegin; 1497 PetscValidPointer(A,2); 1498 PetscValidPointer(B,3); 1499 *A = aij->A; 1500 *B = aij->B; 1501 /* Note that we don't incref *A and *B, so be careful! */ 1502 PetscFunctionReturn(0); 1503 } 1504 1505 1506 #undef __FUNCT__ 1507 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1508 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1509 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1510 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1511 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1512 { 1513 PetscErrorCode ierr; 1514 PetscMPIInt size, flag; 1515 PetscInt i,ii; 1516 PetscInt ismax_c; 1517 PetscFunctionBegin; 1518 if(!ismax) { 1519 PetscFunctionReturn(0); 1520 } 1521 for(i = 0, ismax_c = 0; i < ismax; ++i) { 1522 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr); 1523 if(flag != MPI_IDENT) { 1524 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1525 } 1526 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1527 if(size > 1) { 1528 ++ismax_c; 1529 } 1530 } 1531 if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 1532 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr); 1533 } 1534 else { /* if(ismax_c) */ 1535 Mat *A,*B; 1536 IS *isrow_c, *iscol_c; 1537 PetscMPIInt size; 1538 /* 1539 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1540 array of sequential matrices underlying the resulting parallel matrices. 1541 Which arrays to allocate is based on the value of MatReuse scall. 1542 There are as many diag matrices as there are original index sets. 1543 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 1544 1545 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1546 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1547 will deposite the extracted diag and off-diag parts. 1548 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1549 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1550 */ 1551 1552 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1553 if (scall != MAT_REUSE_MATRIX) { 1554 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1555 } 1556 else { 1557 ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr); 1558 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr); 1559 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1560 for(i = 0, ii = 0; i < ismax; ++i) { 1561 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1562 if(size > 1) { 1563 ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr); 1564 ++ii; 1565 } 1566 else { 1567 A[i] = (*submat)[i]; 1568 } 1569 } 1570 } 1571 /* 1572 Construct the complements of the iscol ISs for parallel ISs only. 1573 These are used to extract the off-diag portion of the resulting parallel matrix. 1574 The row IS for the off-diag portion is the same as for the diag portion, 1575 so we merely alias the row IS, while skipping those that are sequential. 1576 */ 1577 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr); 1578 for(i = 0, ii = 0; i < ismax; ++i) { 1579 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1580 if(size > 1) { 1581 isrow_c[ii] = isrow[i]; 1582 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr); 1583 ++ii; 1584 } 1585 } 1586 /* Now obtain the sequential A and B submatrices separately. */ 1587 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr); 1588 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr); 1589 for(ii = 0; ii < ismax_c; ++ii) { 1590 ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr); 1591 } 1592 ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr); 1593 /* 1594 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1595 have been extracted directly into the parallel matrices containing them, or 1596 simply into the sequential matrix identical with the corresponding A (if size == 1). 1597 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1598 A is put into the correct submat slot (if size == 1). 1599 */ 1600 if(scall != MAT_REUSE_MATRIX) { 1601 for(i = 0, ii = 0; i < ismax; ++i) { 1602 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1603 if(size > 1) { 1604 /* 1605 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1606 */ 1607 /* Construct submat[i] from the Seq pieces A and B. */ 1608 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr); 1609 1610 ++ii; 1611 } 1612 else { 1613 (*submat)[i] = A[i]; 1614 } 1615 } 1616 } 1617 ierr = PetscFree(A); CHKERRQ(ierr); 1618 ierr = PetscFree(B); CHKERRQ(ierr); 1619 } 1620 PetscFunctionReturn(0); 1621 }/* MatGetSubMatricesParallel_MPIXAIJ() */ 1622 1623 1624 1625 #undef __FUNCT__ 1626 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1627 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1628 { 1629 PetscErrorCode ierr; 1630 PetscFunctionBegin; 1631 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ, 1632 MatCreateMPIAIJFromSeqMatrices_Private, 1633 MatMPIAIJExtractSeqMatrices_Private); 1634 CHKERRQ(ierr); 1635 PetscFunctionReturn(0); 1636 } 1637