/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5]:
  mesg [0] = 2      (number of index sets in the message)
  -----------
  mesg [1] = 1      => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5      => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  ...
  mesg [n]          data for is[1]
  -----------
  mesg [n+1]
  ...
  mesg [m]          data for is[5]
  -----------

  Notes:
  nrqs - number of requests sent (or to be sent out)
  nrqr - number of requests received (which have to be, or which have been, processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.
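     (Illustration only, with hypothetical ranks and counts: if this process needs
     rank 2 to work on 3 rows of is[0] and 2 rows of is[4], the outgoing message
     to rank 2, in the format documented above, is
         {2,  0,3,  4,2,  r0,r1,r2,  s0,s1}
     i.e. a header of 1 + 2*(number of ISs) entries followed by the row indices.)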
Based on this, buffers are allocated, and data copied into them*/ 89 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); 90 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 91 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 92 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 93 for (i=0; i<imax; i++) { 94 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 95 idx_i = idx[i]; 96 len = n[i]; 97 for (j=0; j<len; j++) { 98 row = idx_i[j]; 99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 100 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 101 w4[proc]++; 102 } 103 for (j=0; j<size; j++){ 104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 105 } 106 } 107 108 nrqs = 0; /* no of outgoing messages */ 109 msz = 0; /* total mesg length (for all proc */ 110 w1[rank] = 0; /* no mesg sent to intself */ 111 w3[rank] = 0; 112 for (i=0; i<size; i++) { 113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 114 } 115 /* pa - is list of processors to communicate with */ 116 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); 117 for (i=0,j=0; i<size; i++) { 118 if (w1[i]) {pa[j] = i; j++;} 119 } 120 121 /* Each message would have a header = 1 + 2*(no of IS) + data */ 122 for (i=0; i<nrqs; i++) { 123 j = pa[i]; 124 w1[j] += w2[j] + 2*w3[j]; 125 msz += w1[j]; 126 } 127 128 /* Determine the number of messages to expect, their lengths, from from-ids */ 129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 131 132 /* Now post the Irecvs corresponding to these messages */ 133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 134 135 /* Allocate Memory for outgoing messages */ 136 ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 137 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 138 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 139 140 { 141 PetscInt *iptr = tmp,ict = 0; 142 for (i=0; i<nrqs; i++) { 143 j = pa[i]; 144 iptr += ict; 145 outdat[j] = iptr; 146 ict = w1[j]; 147 } 148 } 149 150 /* Form the outgoing messages */ 151 /*plug in the headers*/ 152 for (i=0; i<nrqs; i++) { 153 j = pa[i]; 154 outdat[j][0] = 0; 155 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 156 ptr[j] = outdat[j] + 2*w3[j] + 1; 157 } 158 159 /* Memory for doing local proc's work*/ 160 { 161 ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz, 162 M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr); 163 ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr); 164 ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr); 165 ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr); 166 ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr); 167 ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr); 168 169 for (i=0; i<imax; i++) { 170 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 171 data[i] = d_p + M*i; 172 } 173 } 174 175 /* Parse the IS and update local tables and the outgoing buf with the data*/ 176 { 177 PetscInt 
n_i,*data_i,isz_i,*outdat_j,ctr_j; 178 PetscBT table_i; 179 180 for (i=0; i<imax; i++) { 181 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 182 n_i = n[i]; 183 table_i = table[i]; 184 idx_i = idx[i]; 185 data_i = data[i]; 186 isz_i = isz[i]; 187 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 188 row = idx_i[j]; 189 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 190 if (proc != rank) { /* copy to the outgoing buffer */ 191 ctr[proc]++; 192 *ptr[proc] = row; 193 ptr[proc]++; 194 } else { /* Update the local table */ 195 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 196 } 197 } 198 /* Update the headers for the current IS */ 199 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 200 if ((ctr_j = ctr[j])) { 201 outdat_j = outdat[j]; 202 k = ++outdat_j[0]; 203 outdat_j[2*k] = ctr_j; 204 outdat_j[2*k-1] = i; 205 } 206 } 207 isz[i] = isz_i; 208 } 209 } 210 211 /* Now post the sends */ 212 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 213 for (i=0; i<nrqs; ++i) { 214 j = pa[i]; 215 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 216 } 217 218 /* No longer need the original indices*/ 219 for (i=0; i<imax; ++i) { 220 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 221 } 222 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 223 224 for (i=0; i<imax; ++i) { 225 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 226 } 227 228 /* Do Local work*/ 229 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 230 231 /* Receive messages*/ 232 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr); 233 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 234 235 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 236 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 237 238 /* Phase 1 sends are complete - deallocate buffers */ 239 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 240 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 241 242 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr); 243 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr); 244 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 245 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 246 ierr = PetscFree(rbuf);CHKERRQ(ierr); 247 248 249 /* Send the data back*/ 250 /* Do a global reduction to know the buffer space req for incoming messages*/ 251 { 252 PetscMPIInt *rw1; 253 254 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr); 255 ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 256 257 for (i=0; i<nrqr; ++i) { 258 proc = recv_status[i].MPI_SOURCE; 259 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 260 rw1[proc] = isz1[i]; 261 } 262 ierr = PetscFree(onodes1);CHKERRQ(ierr); 263 ierr = PetscFree(olengths1);CHKERRQ(ierr); 264 265 /* Determine the number of messages to expect, their lengths, from from-ids */ 266 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 267 ierr = PetscFree(rw1);CHKERRQ(ierr); 268 } 269 /* Now post the Irecvs corresponding to these messages */ 270 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 271 272 /* Now post the sends */ 273 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 274 for (i=0; i<nrqr; ++i) { 275 j = recv_status[i].MPI_SOURCE; 276 ierr = 
MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 277 } 278 279 /* receive work done on other processors*/ 280 { 281 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 282 PetscMPIInt idex; 283 PetscBT table_i; 284 MPI_Status *status2; 285 286 ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr); 287 for (i=0; i<nrqs; ++i) { 288 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 289 /* Process the message*/ 290 rbuf2_i = rbuf2[idex]; 291 ct1 = 2*rbuf2_i[0]+1; 292 jmax = rbuf2[idex][0]; 293 for (j=1; j<=jmax; j++) { 294 max = rbuf2_i[2*j]; 295 is_no = rbuf2_i[2*j-1]; 296 isz_i = isz[is_no]; 297 data_i = data[is_no]; 298 table_i = table[is_no]; 299 for (k=0; k<max; k++,ct1++) { 300 row = rbuf2_i[ct1]; 301 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 302 } 303 isz[is_no] = isz_i; 304 } 305 } 306 307 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 308 ierr = PetscFree(status2);CHKERRQ(ierr); 309 } 310 311 for (i=0; i<imax; ++i) { 312 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 313 } 314 315 ierr = PetscFree(onodes2);CHKERRQ(ierr); 316 ierr = PetscFree(olengths2);CHKERRQ(ierr); 317 318 ierr = PetscFree(pa);CHKERRQ(ierr); 319 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 320 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 321 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 322 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 323 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 324 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 325 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 326 ierr = PetscFree(s_status);CHKERRQ(ierr); 327 ierr = PetscFree(recv_status);CHKERRQ(ierr); 328 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 329 ierr = PetscFree(xdata);CHKERRQ(ierr); 330 ierr = PetscFree(isz1);CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 #undef __FUNCT__ 335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 336 /* 337 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 338 the work on the local processor. 339 340 Inputs: 341 C - MAT_MPIAIJ; 342 imax - total no of index sets processed at a time; 343 table - an array of char - size = m bits. 
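            (each table[i] is a bit array over the global row/column indices of C;
             a set bit means that index is already present in data[i], so duplicates
             are skipped when the overlap is grown)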

  Output:
    isz  - array containing the count of the solution elements corresponding
           to each index set;
    data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
  MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
  and return the output

  Input:
    C    - the matrix
    nrqr - number of messages being processed.
    rbuf - an array of pointers to the received requests

  Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

  For better efficiency perhaps we should malloc each xdata[i] separately; then
  if a re-malloc is required we would need only copy the data for that one row
  rather than all previous rows, as is done now where a single large chunk of
  memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc = 0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ?
total_sz : max1)+1); 449 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 450 ++no_malloc; 451 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 452 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 453 454 ct3 = 0; 455 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 456 rbuf_i = rbuf[i]; 457 rbuf_0 = rbuf_i[0]; 458 ct1 = 2*rbuf_0+1; 459 ct2 = ct1; 460 ct3 += ct1; 461 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 462 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 463 oct2 = ct2; 464 kmax = rbuf_i[2*j]; 465 for (k=0; k<kmax; k++,ct1++) { 466 row = rbuf_i[ct1]; 467 if (!PetscBTLookupSet(xtable,row)) { 468 if (!(ct3 < mem_estimate)) { 469 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 470 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 471 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 472 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 473 xdata[0] = tmp; 474 mem_estimate = new_estimate; ++no_malloc; 475 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 476 } 477 xdata[i][ct2++] = row; 478 ct3++; 479 } 480 } 481 for (k=oct2,max2=ct2; k<max2; k++) { 482 row = xdata[i][k] - rstart; 483 start = ai[row]; 484 end = ai[row+1]; 485 for (l=start; l<end; l++) { 486 val = aj[l] + cstart; 487 if (!PetscBTLookupSet(xtable,val)) { 488 if (!(ct3 < mem_estimate)) { 489 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 490 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 491 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 492 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 493 xdata[0] = tmp; 494 mem_estimate = new_estimate; ++no_malloc; 495 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 496 } 497 xdata[i][ct2++] = val; 498 ct3++; 499 } 500 } 501 start = bi[row]; 502 end = bi[row+1]; 503 for (l=start; l<end; l++) { 504 val = garray[bj[l]]; 505 if (!PetscBTLookupSet(xtable,val)) { 506 if (!(ct3 < mem_estimate)) { 507 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 508 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 509 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 510 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 511 xdata[0] = tmp; 512 mem_estimate = new_estimate; ++no_malloc; 513 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 514 } 515 xdata[i][ct2++] = val; 516 ct3++; 517 } 518 } 519 } 520 /* Update the header*/ 521 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 522 xdata[i][2*j-1] = rbuf_i[2*j-1]; 523 } 524 xdata[i][0] = rbuf_0; 525 xdata[i+1] = xdata[i] + ct2; 526 isz1[i] = ct2; /* size of each message */ 527 } 528 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 529 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 530 PetscFunctionReturn(0); 531 } 532 /* -------------------------------------------------------------------------*/ 533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 535 /* 536 Every processor gets the entire matrix 537 */ 538 #undef __FUNCT__ 539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 540 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 541 { 542 Mat B; 543 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 544 Mat_SeqAIJ *b,*ad = 
(Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 545 PetscErrorCode ierr; 546 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 547 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 548 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 549 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 550 551 PetscFunctionBegin; 552 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 553 ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 554 555 if (scall == MAT_INITIAL_MATRIX) { 556 /* ---------------------------------------------------------------- 557 Tell every processor the number of nonzeros per row 558 */ 559 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 560 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 561 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 562 } 563 sendcount = A->rmap->rend - A->rmap->rstart; 564 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 565 for (i=0; i<size; i++) { 566 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 567 displs[i] = A->rmap->range[i]; 568 } 569 #if defined(PETSC_HAVE_MPI_IN_PLACE) 570 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 571 #else 572 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 573 #endif 574 /* --------------------------------------------------------------- 575 Create the sequential matrix of the same type as the local block diagonal 576 */ 577 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 578 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 579 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 580 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 581 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 582 **Bin = B; 583 b = (Mat_SeqAIJ *)B->data; 584 585 /*-------------------------------------------------------------------- 586 Copy my part of matrix column indices over 587 */ 588 sendcount = ad->nz + bd->nz; 589 jsendbuf = b->j + b->i[rstarts[rank]]; 590 a_jsendbuf = ad->j; 591 b_jsendbuf = bd->j; 592 n = A->rmap->rend - A->rmap->rstart; 593 cnt = 0; 594 for (i=0; i<n; i++) { 595 596 /* put in lower diagonal portion */ 597 m = bd->i[i+1] - bd->i[i]; 598 while (m > 0) { 599 /* is it above diagonal (in bd (compressed) numbering) */ 600 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 601 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 602 m--; 603 } 604 605 /* put in diagonal portion */ 606 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 607 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 608 } 609 610 /* put in upper diagonal portion */ 611 while (m-- > 0) { 612 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 613 } 614 } 615 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 616 617 /*-------------------------------------------------------------------- 618 Gather all column indices to all processors 619 */ 620 for (i=0; i<size; i++) { 621 recvcounts[i] = 0; 622 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 623 recvcounts[i] += lens[j]; 624 } 625 } 626 displs[0] = 0; 627 for (i=1; i<size; i++) { 628 displs[i] = displs[i-1] + recvcounts[i-1]; 629 } 630 #if defined(PETSC_HAVE_MPI_IN_PLACE) 631 ierr = 
MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 632 #else 633 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 634 #endif 635 /*-------------------------------------------------------------------- 636 Assemble the matrix into useable form (note numerical values not yet set) 637 */ 638 /* set the b->ilen (length of each row) values */ 639 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 640 /* set the b->i indices */ 641 b->i[0] = 0; 642 for (i=1; i<=A->rmap->N; i++) { 643 b->i[i] = b->i[i-1] + lens[i-1]; 644 } 645 ierr = PetscFree(lens);CHKERRQ(ierr); 646 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 647 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 648 649 } else { 650 B = **Bin; 651 b = (Mat_SeqAIJ *)B->data; 652 } 653 654 /*-------------------------------------------------------------------- 655 Copy my part of matrix numerical values into the values location 656 */ 657 if (flag == MAT_GET_VALUES){ 658 sendcount = ad->nz + bd->nz; 659 sendbuf = b->a + b->i[rstarts[rank]]; 660 a_sendbuf = ad->a; 661 b_sendbuf = bd->a; 662 b_sendj = bd->j; 663 n = A->rmap->rend - A->rmap->rstart; 664 cnt = 0; 665 for (i=0; i<n; i++) { 666 667 /* put in lower diagonal portion */ 668 m = bd->i[i+1] - bd->i[i]; 669 while (m > 0) { 670 /* is it above diagonal (in bd (compressed) numbering) */ 671 if (garray[*b_sendj] > A->rmap->rstart + i) break; 672 sendbuf[cnt++] = *b_sendbuf++; 673 m--; 674 b_sendj++; 675 } 676 677 /* put in diagonal portion */ 678 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 679 sendbuf[cnt++] = *a_sendbuf++; 680 } 681 682 /* put in upper diagonal portion */ 683 while (m-- > 0) { 684 sendbuf[cnt++] = *b_sendbuf++; 685 b_sendj++; 686 } 687 } 688 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 689 690 /* ----------------------------------------------------------------- 691 Gather all numerical values to all processors 692 */ 693 if (!recvcounts) { 694 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 695 } 696 for (i=0; i<size; i++) { 697 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 698 } 699 displs[0] = 0; 700 for (i=1; i<size; i++) { 701 displs[i] = displs[i-1] + recvcounts[i-1]; 702 } 703 recvbuf = b->a; 704 #if defined(PETSC_HAVE_MPI_IN_PLACE) 705 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 706 #else 707 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 708 #endif 709 } /* endof (flag == MAT_GET_VALUES) */ 710 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 711 712 if (A->symmetric){ 713 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 714 } else if (A->hermitian) { 715 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 716 } else if (A->structurally_symmetric) { 717 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 718 } 719 PetscFunctionReturn(0); 720 } 721 722 723 724 #undef __FUNCT__ 725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 727 { 728 PetscErrorCode ierr; 729 PetscInt 
nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 730 PetscBool rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix; 731 732 PetscFunctionBegin; 733 /* 734 Check for special case each processor gets entire matrix 735 */ 736 if (ismax == 1 && C->rmap->N == C->cmap->N) { 737 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 738 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 739 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 740 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 741 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 742 wantallmatrix = PETSC_TRUE; 743 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 744 } 745 } 746 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 747 if (twantallmatrix) { 748 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 749 PetscFunctionReturn(0); 750 } 751 752 /* Allocate memory to hold all the submatrices */ 753 if (scall != MAT_REUSE_MATRIX) { 754 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 755 } 756 /* Determine the number of stages through which submatrices are done */ 757 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 758 /* 759 Each stage will extract nmax submatrices. 760 nmax is determined by the matrix column dimension. 761 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 762 */ 763 if (!nmax) nmax = 1; 764 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 765 766 /* Make sure every processor loops through the nstages */ 767 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 768 769 for (i=0,pos=0; i<nstages; i++) { 770 if (pos+nmax <= ismax) max_no = nmax; 771 else if (pos == ismax) max_no = 0; 772 else max_no = ismax-pos; 773 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 774 pos += max_no; 775 } 776 PetscFunctionReturn(0); 777 } 778 779 /* -------------------------------------------------------------------------*/ 780 #undef __FUNCT__ 781 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 782 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 783 { 784 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 785 Mat A = c->A; 786 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 787 const PetscInt **icol,**irow; 788 PetscInt *nrow,*ncol,start; 789 PetscErrorCode ierr; 790 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 791 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 792 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 793 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 794 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 795 #if defined (PETSC_USE_CTABLE) 796 PetscTable *cmap,cmap_i,*rmap,rmap_i; 797 #else 798 PetscInt **cmap,*cmap_i,**rmap,*rmap_i; 799 #endif 800 const PetscInt *irow_i; 801 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 802 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 803 MPI_Request *r_waits4,*s_waits3,*s_waits4; 804 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 805 MPI_Status *r_status3,*r_status4,*s_status4; 806 MPI_Comm comm; 807 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 808 PetscMPIInt *onodes1,*olengths1; 809 PetscMPIInt idex,idex2,end; 810 811 PetscFunctionBegin; 812 
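  /*
     Outline of the communication used below:

       1) For every requested IS, count how many of its rows live on each process
          and send those request messages (tag0); each message carries a header
          (number of ISs, then (IS index, row count) pairs) followed by the rows.
       2) Each process that receives a request counts the nonzeros it owns in the
          requested rows and replies with those sizes (tag1), so the requesters
          can allocate receive buffers.
       3) The owners then send the column indices (tag2) and the numerical values
          (tag3) of the requested rows.
       4) Finally the requesting process builds per-submatrix row/column maps,
          fills in the row lengths (lens), creates or reuses the sequential
          submatrices, and assembles first the locally owned rows and then the
          rows received from other processes.
  */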
comm = ((PetscObject)C)->comm; 813 tag0 = ((PetscObject)C)->tag; 814 size = c->size; 815 rank = c->rank; 816 817 /* Get some new tags to keep the communication clean */ 818 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 819 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 820 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 821 822 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 823 824 for (i=0; i<ismax; i++) { 825 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 826 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 827 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 828 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 829 } 830 831 /* evaluate communication - mesg to who, length of mesg, and buffer space 832 required. Based on this, buffers are allocated, and data copied into them*/ 833 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 834 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 835 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 836 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 837 for (i=0; i<ismax; i++) { 838 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 839 jmax = nrow[i]; 840 irow_i = irow[i]; 841 for (j=0; j<jmax; j++) { 842 l = 0; 843 row = irow_i[j]; 844 while (row >= C->rmap->range[l+1]) l++; 845 proc = l; 846 w4[proc]++; 847 } 848 for (j=0; j<size; j++) { 849 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 850 } 851 } 852 853 nrqs = 0; /* no of outgoing messages */ 854 msz = 0; /* total mesg length (for all procs) */ 855 w1[rank] = 0; /* no mesg sent to self */ 856 w3[rank] = 0; 857 for (i=0; i<size; i++) { 858 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 859 } 860 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 861 for (i=0,j=0; i<size; i++) { 862 if (w1[i]) { pa[j] = i; j++; } 863 } 864 865 /* Each message would have a header = 1 + 2*(no of IS) + data */ 866 for (i=0; i<nrqs; i++) { 867 j = pa[i]; 868 w1[j] += w2[j] + 2* w3[j]; 869 msz += w1[j]; 870 } 871 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 872 873 /* Determine the number of messages to expect, their lengths, from from-ids */ 874 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 875 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 876 877 /* Now post the Irecvs corresponding to these messages */ 878 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 879 880 ierr = PetscFree(onodes1);CHKERRQ(ierr); 881 ierr = PetscFree(olengths1);CHKERRQ(ierr); 882 883 /* Allocate Memory for outgoing messages */ 884 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 885 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 886 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 887 888 { 889 PetscInt *iptr = tmp,ict = 0; 890 for (i=0; i<nrqs; i++) { 891 j = pa[i]; 892 iptr += ict; 893 sbuf1[j] = iptr; 894 ict = w1[j]; 895 } 896 } 897 898 /* Form the outgoing messages */ 899 /* Initialize the header space */ 900 for 
(i=0; i<nrqs; i++) { 901 j = pa[i]; 902 sbuf1[j][0] = 0; 903 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 904 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 905 } 906 907 /* Parse the isrow and copy data into outbuf */ 908 for (i=0; i<ismax; i++) { 909 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 910 irow_i = irow[i]; 911 jmax = nrow[i]; 912 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 913 l = 0; 914 row = irow_i[j]; 915 while (row >= C->rmap->range[l+1]) l++; 916 proc = l; 917 if (proc != rank) { /* copy to the outgoing buf*/ 918 ctr[proc]++; 919 *ptr[proc] = row; 920 ptr[proc]++; 921 } 922 } 923 /* Update the headers for the current IS */ 924 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 925 if ((ctr_j = ctr[j])) { 926 sbuf1_j = sbuf1[j]; 927 k = ++sbuf1_j[0]; 928 sbuf1_j[2*k] = ctr_j; 929 sbuf1_j[2*k-1] = i; 930 } 931 } 932 } 933 934 /* Now post the sends */ 935 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 936 for (i=0; i<nrqs; ++i) { 937 j = pa[i]; 938 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 939 } 940 941 /* Post Receives to capture the buffer size */ 942 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 943 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 944 rbuf2[0] = tmp + msz; 945 for (i=1; i<nrqs; ++i) { 946 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 947 } 948 for (i=0; i<nrqs; ++i) { 949 j = pa[i]; 950 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 951 } 952 953 /* Send to other procs the buf size they should allocate */ 954 955 956 /* Receive messages*/ 957 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 958 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 959 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 960 { 961 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 962 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 963 PetscInt *sbuf2_i; 964 965 for (i=0; i<nrqr; ++i) { 966 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 967 req_size[idex] = 0; 968 rbuf1_i = rbuf1[idex]; 969 start = 2*rbuf1_i[0] + 1; 970 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 971 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 972 sbuf2_i = sbuf2[idex]; 973 for (j=start; j<end; j++) { 974 id = rbuf1_i[j] - rstart; 975 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 976 sbuf2_i[j] = ncols; 977 req_size[idex] += ncols; 978 } 979 req_source[idex] = r_status1[i].MPI_SOURCE; 980 /* form the header */ 981 sbuf2_i[0] = req_size[idex]; 982 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 983 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 984 } 985 } 986 ierr = PetscFree(r_status1);CHKERRQ(ierr); 987 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 988 989 /* recv buffer sizes */ 990 /* Receive messages*/ 991 992 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 993 ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 994 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 995 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 996 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 997 998 for (i=0; i<nrqs; ++i) { 999 ierr = 
MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1000 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1001 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1002 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1003 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1004 } 1005 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1006 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1007 1008 /* Wait on sends1 and sends2 */ 1009 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1010 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1011 1012 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1013 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1014 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1015 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1016 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1017 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1018 1019 /* Now allocate buffers for a->j, and send them off */ 1020 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1021 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1022 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1023 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1024 1025 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1026 { 1027 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1028 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1029 PetscInt cend = C->cmap->rend; 1030 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1031 1032 for (i=0; i<nrqr; i++) { 1033 rbuf1_i = rbuf1[i]; 1034 sbuf_aj_i = sbuf_aj[i]; 1035 ct1 = 2*rbuf1_i[0] + 1; 1036 ct2 = 0; 1037 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1038 kmax = rbuf1[i][2*j]; 1039 for (k=0; k<kmax; k++,ct1++) { 1040 row = rbuf1_i[ct1] - rstart; 1041 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1042 ncols = nzA + nzB; 1043 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1044 1045 /* load the column indices for this row into cols*/ 1046 cols = sbuf_aj_i + ct2; 1047 1048 lwrite = 0; 1049 for (l=0; l<nzB; l++) { 1050 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1051 } 1052 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1053 for (l=0; l<nzB; l++) { 1054 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1055 } 1056 1057 ct2 += ncols; 1058 } 1059 } 1060 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1061 } 1062 } 1063 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1064 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1065 1066 /* Allocate buffers for a->a, and send them off */ 1067 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1068 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1069 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1070 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1071 1072 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1073 { 1074 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1075 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1076 PetscInt cend = C->cmap->rend; 1077 PetscInt *b_j = b->j; 
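    /* The values below are packed in exactly the same global-column order as the
       column indices were packed above: off-diagonal entries with global column
       < cstart, then the diagonal block, then off-diagonal entries with global
       column >= cend. */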
1078 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1079 1080 for (i=0; i<nrqr; i++) { 1081 rbuf1_i = rbuf1[i]; 1082 sbuf_aa_i = sbuf_aa[i]; 1083 ct1 = 2*rbuf1_i[0]+1; 1084 ct2 = 0; 1085 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1086 kmax = rbuf1_i[2*j]; 1087 for (k=0; k<kmax; k++,ct1++) { 1088 row = rbuf1_i[ct1] - rstart; 1089 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1090 ncols = nzA + nzB; 1091 cworkB = b_j + b_i[row]; 1092 vworkA = a_a + a_i[row]; 1093 vworkB = b_a + b_i[row]; 1094 1095 /* load the column values for this row into vals*/ 1096 vals = sbuf_aa_i+ct2; 1097 1098 lwrite = 0; 1099 for (l=0; l<nzB; l++) { 1100 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1101 } 1102 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1103 for (l=0; l<nzB; l++) { 1104 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1105 } 1106 1107 ct2 += ncols; 1108 } 1109 } 1110 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1111 } 1112 } 1113 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1114 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1115 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1116 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1117 1118 /* Form the matrix */ 1119 /* create col map: global col of C -> local col of submatrices */ 1120 { 1121 const PetscInt *icol_i; 1122 #if defined (PETSC_USE_CTABLE) 1123 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1124 for (i=0; i<ismax; i++) { 1125 ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr); 1126 jmax = ncol[i]; 1127 icol_i = icol[i]; 1128 cmap_i = cmap[i]; 1129 for (j=0; j<jmax; j++) { 1130 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr); 1131 } 1132 } 1133 #else 1134 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1135 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1136 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1137 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1138 for (i=0; i<ismax; i++) { 1139 jmax = ncol[i]; 1140 icol_i = icol[i]; 1141 cmap_i = cmap[i]; 1142 for (j=0; j<jmax; j++) { 1143 cmap_i[icol_i[j]] = j+1; 1144 } 1145 } 1146 #endif 1147 } 1148 1149 /* Create lens which is required for MatCreate... 
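     (lens[i][r] will hold the number of nonzero columns kept in local row r of
      submatrix i, and is later passed to MatSeqAIJSetPreallocation when the
      submatrices are created)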
*/ 1150 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1151 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1152 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1153 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1154 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1155 1156 /* Update lens from local data */ 1157 for (i=0; i<ismax; i++) { 1158 jmax = nrow[i]; 1159 cmap_i = cmap[i]; 1160 irow_i = irow[i]; 1161 lens_i = lens[i]; 1162 for (j=0; j<jmax; j++) { 1163 l = 0; 1164 row = irow_i[j]; 1165 while (row >= C->rmap->range[l+1]) l++; 1166 proc = l; 1167 if (proc == rank) { 1168 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1169 for (k=0; k<ncols; k++) { 1170 #if defined (PETSC_USE_CTABLE) 1171 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1172 #else 1173 tcol = cmap_i[cols[k]]; 1174 #endif 1175 if (tcol) { lens_i[j]++;} 1176 } 1177 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1178 } 1179 } 1180 } 1181 1182 /* Create row map: global row of C -> local row of submatrices */ 1183 #if defined (PETSC_USE_CTABLE) 1184 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1185 for (i=0; i<ismax; i++) { 1186 ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr); 1187 rmap_i = rmap[i]; 1188 irow_i = irow[i]; 1189 jmax = nrow[i]; 1190 for (j=0; j<jmax; j++) { 1191 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr); 1192 } 1193 } 1194 #else 1195 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1196 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1197 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1198 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1199 for (i=0; i<ismax; i++) { 1200 rmap_i = rmap[i]; 1201 irow_i = irow[i]; 1202 jmax = nrow[i]; 1203 for (j=0; j<jmax; j++) { 1204 rmap_i[irow_i[j]] = j; 1205 } 1206 } 1207 #endif 1208 1209 /* Update lens from offproc data */ 1210 { 1211 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1212 1213 for (tmp2=0; tmp2<nrqs; tmp2++) { 1214 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1215 idex = pa[idex2]; 1216 sbuf1_i = sbuf1[idex]; 1217 jmax = sbuf1_i[0]; 1218 ct1 = 2*jmax+1; 1219 ct2 = 0; 1220 rbuf2_i = rbuf2[idex2]; 1221 rbuf3_i = rbuf3[idex2]; 1222 for (j=1; j<=jmax; j++) { 1223 is_no = sbuf1_i[2*j-1]; 1224 max1 = sbuf1_i[2*j]; 1225 lens_i = lens[is_no]; 1226 cmap_i = cmap[is_no]; 1227 rmap_i = rmap[is_no]; 1228 for (k=0; k<max1; k++,ct1++) { 1229 #if defined (PETSC_USE_CTABLE) 1230 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1231 row--; 1232 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1233 #else 1234 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1235 #endif 1236 max2 = rbuf2_i[ct1]; 1237 for (l=0; l<max2; l++,ct2++) { 1238 #if defined (PETSC_USE_CTABLE) 1239 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1240 #else 1241 tcol = cmap_i[rbuf3_i[ct2]]; 1242 #endif 1243 if (tcol) { 1244 lens_i[row]++; 1245 } 1246 } 1247 } 1248 } 1249 } 1250 } 1251 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1252 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1253 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1254 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1255 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1256 1257 /* Create the submatrices */ 1258 if (scall == MAT_REUSE_MATRIX) { 1259 PetscBool flag; 1260 1261 /* 1262 Assumes 
new rows are same length as the old rows,hence bug! 1263 */ 1264 for (i=0; i<ismax; i++) { 1265 mat = (Mat_SeqAIJ *)(submats[i]->data); 1266 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1267 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1268 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1269 /* Initial matrix as if empty */ 1270 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1271 submats[i]->factortype = C->factortype; 1272 } 1273 } else { 1274 for (i=0; i<ismax; i++) { 1275 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1276 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1277 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1278 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1279 } 1280 } 1281 1282 /* Assemble the matrices */ 1283 /* First assemble the local rows */ 1284 { 1285 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1286 PetscScalar *imat_a; 1287 1288 for (i=0; i<ismax; i++) { 1289 mat = (Mat_SeqAIJ*)submats[i]->data; 1290 imat_ilen = mat->ilen; 1291 imat_j = mat->j; 1292 imat_i = mat->i; 1293 imat_a = mat->a; 1294 cmap_i = cmap[i]; 1295 rmap_i = rmap[i]; 1296 irow_i = irow[i]; 1297 jmax = nrow[i]; 1298 for (j=0; j<jmax; j++) { 1299 l = 0; 1300 row = irow_i[j]; 1301 while (row >= C->rmap->range[l+1]) l++; 1302 proc = l; 1303 if (proc == rank) { 1304 old_row = row; 1305 #if defined (PETSC_USE_CTABLE) 1306 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1307 row--; 1308 #else 1309 row = rmap_i[row]; 1310 #endif 1311 ilen_row = imat_ilen[row]; 1312 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1313 mat_i = imat_i[row] ; 1314 mat_a = imat_a + mat_i; 1315 mat_j = imat_j + mat_i; 1316 for (k=0; k<ncols; k++) { 1317 #if defined (PETSC_USE_CTABLE) 1318 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1319 #else 1320 tcol = cmap_i[cols[k]]; 1321 #endif 1322 if (tcol){ 1323 *mat_j++ = tcol - 1; 1324 *mat_a++ = vals[k]; 1325 ilen_row++; 1326 } 1327 } 1328 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1329 imat_ilen[row] = ilen_row; 1330 } 1331 } 1332 } 1333 } 1334 1335 /* Now assemble the off proc rows*/ 1336 { 1337 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1338 PetscInt *imat_j,*imat_i; 1339 PetscScalar *imat_a,*rbuf4_i; 1340 1341 for (tmp2=0; tmp2<nrqs; tmp2++) { 1342 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1343 idex = pa[idex2]; 1344 sbuf1_i = sbuf1[idex]; 1345 jmax = sbuf1_i[0]; 1346 ct1 = 2*jmax + 1; 1347 ct2 = 0; 1348 rbuf2_i = rbuf2[idex2]; 1349 rbuf3_i = rbuf3[idex2]; 1350 rbuf4_i = rbuf4[idex2]; 1351 for (j=1; j<=jmax; j++) { 1352 is_no = sbuf1_i[2*j-1]; 1353 rmap_i = rmap[is_no]; 1354 cmap_i = cmap[is_no]; 1355 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1356 imat_ilen = mat->ilen; 1357 imat_j = mat->j; 1358 imat_i = mat->i; 1359 imat_a = mat->a; 1360 max1 = sbuf1_i[2*j]; 1361 for (k=0; k<max1; k++,ct1++) { 1362 row = sbuf1_i[ct1]; 1363 #if defined (PETSC_USE_CTABLE) 1364 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1365 row--; 1366 #else 1367 row = rmap_i[row]; 1368 #endif 1369 ilen = imat_ilen[row]; 1370 mat_i = imat_i[row] ; 1371 mat_a = imat_a + mat_i; 1372 mat_j = imat_j + mat_i; 1373 max2 = rbuf2_i[ct1]; 
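            /* max2 = number of column indices returned for this requested row;
               keep only the columns that appear in this submatrix's column map cmap_i */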
1374 for (l=0; l<max2; l++,ct2++) { 1375 1376 #if defined (PETSC_USE_CTABLE) 1377 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1378 #else 1379 tcol = cmap_i[rbuf3_i[ct2]]; 1380 #endif 1381 if (tcol) { 1382 *mat_j++ = tcol - 1; 1383 *mat_a++ = rbuf4_i[ct2]; 1384 ilen++; 1385 } 1386 } 1387 imat_ilen[row] = ilen; 1388 } 1389 } 1390 } 1391 } 1392 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1393 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1394 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1395 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1396 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1397 1398 /* Restore the indices */ 1399 for (i=0; i<ismax; i++) { 1400 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1401 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1402 } 1403 1404 /* Destroy allocated memory */ 1405 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1406 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1407 ierr = PetscFree(pa);CHKERRQ(ierr); 1408 1409 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1410 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1411 for (i=0; i<nrqr; ++i) { 1412 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1413 } 1414 for (i=0; i<nrqs; ++i) { 1415 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1416 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1417 } 1418 1419 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1420 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1421 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1422 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1423 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1424 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1425 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1426 #if defined (PETSC_USE_CTABLE) 1427 for (i=0; i<ismax; i++) { 1428 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1429 ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr); 1430 } 1431 #else 1432 ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 1433 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1434 #endif 1435 ierr = PetscFree(cmap);CHKERRQ(ierr); 1436 ierr = PetscFree(rmap);CHKERRQ(ierr); 1437 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1438 ierr = PetscFree(lens);CHKERRQ(ierr); 1439 1440 for (i=0; i<ismax; i++) { 1441 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1442 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1443 } 1444 PetscFunctionReturn(0); 1445 } 1446 1447 /* 1448 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1449 Be careful not to destroy them elsewhere. 1450 */ 1451 #undef __FUNCT__ 1452 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1453 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1454 { 1455 /* If making this function public, change the error returned in this function away from _PLIB. */ 1456 PetscErrorCode ierr; 1457 Mat_MPIAIJ *aij; 1458 PetscBool seqaij; 1459 1460 PetscFunctionBegin; 1461 /* Check to make sure the component matrices are compatible with C. 
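     Both must be sequential AIJ and have the same local row dimension and block
     sizes; the column compatibility check ((*C)->cmap->N == A->cmap->n + B->cmap->n)
     is done after the layouts have been set up below.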
*/ 1462 ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1463 if(!seqaij) { 1464 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1465 } 1466 ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1467 if(!seqaij) { 1468 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1469 } 1470 if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) { 1471 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1472 } 1473 ierr = MatCreate(comm, C); CHKERRQ(ierr); 1474 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr); 1475 ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr); 1476 ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr); 1477 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1478 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1479 if((*C)->cmap->N != A->cmap->n + B->cmap->n) { 1480 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1481 } 1482 ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr); 1483 aij = (Mat_MPIAIJ*)((*C)->data); 1484 aij->A = A; 1485 aij->B = B; 1486 ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr); 1487 ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr); 1488 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1489 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1490 PetscFunctionReturn(0); 1491 } 1492 1493 #undef __FUNCT__ 1494 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1495 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1496 { 1497 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1498 PetscFunctionBegin; 1499 PetscValidPointer(A,2); 1500 PetscValidPointer(B,3); 1501 *A = aij->A; 1502 *B = aij->B; 1503 /* Note that we don't incref *A and *B, so be careful! */ 1504 PetscFunctionReturn(0); 1505 } 1506 1507 1508 #undef __FUNCT__ 1509 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1510 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1511 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1512 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1513 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1514 { 1515 PetscErrorCode ierr; 1516 PetscMPIInt size, flag; 1517 PetscInt i,ii; 1518 PetscInt ismax_c; 1519 PetscFunctionBegin; 1520 if(!ismax) { 1521 PetscFunctionReturn(0); 1522 } 1523 for(i = 0, ismax_c = 0; i < ismax; ++i) { 1524 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr); 1525 if(flag != MPI_IDENT) { 1526 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1527 } 1528 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1529 if(size > 1) { 1530 ++ismax_c; 1531 } 1532 } 1533 if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 1534 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr); 1535 } 1536 else { /* if(ismax_c) */ 1537 Mat *A,*B; 1538 IS *isrow_c, *iscol_c; 1539 PetscMPIInt size; 1540 /* 1541 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1542 array of sequential matrices underlying the resulting parallel matrices. 
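       (For example, with ismax = 3 requested submatrices of which exactly one index
       set lives on a communicator of size > 1, the array A below ends up with 3
       entries while B has only 1.)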
1543 Which arrays to allocate is based on the value of MatReuse scall. 1544 There are as many diag matrices as there are original index sets. 1545 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 1546 1547 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1548 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1549 will deposite the extracted diag and off-diag parts. 1550 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1551 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1552 */ 1553 1554 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1555 if (scall != MAT_REUSE_MATRIX) { 1556 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1557 } 1558 else { 1559 ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr); 1560 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr); 1561 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1562 for(i = 0, ii = 0; i < ismax; ++i) { 1563 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1564 if(size > 1) { 1565 ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr); 1566 ++ii; 1567 } 1568 else { 1569 A[i] = (*submat)[i]; 1570 } 1571 } 1572 } 1573 /* 1574 Construct the complements of the iscol ISs for parallel ISs only. 1575 These are used to extract the off-diag portion of the resulting parallel matrix. 1576 The row IS for the off-diag portion is the same as for the diag portion, 1577 so we merely alias the row IS, while skipping those that are sequential. 1578 */ 1579 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr); 1580 for(i = 0, ii = 0; i < ismax; ++i) { 1581 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1582 if(size > 1) { 1583 isrow_c[ii] = isrow[i]; 1584 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr); 1585 ++ii; 1586 } 1587 } 1588 /* Now obtain the sequential A and B submatrices separately. */ 1589 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr); 1590 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr); 1591 for(ii = 0; ii < ismax_c; ++ii) { 1592 ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr); 1593 } 1594 ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr); 1595 /* 1596 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1597 have been extracted directly into the parallel matrices containing them, or 1598 simply into the sequential matrix identical with the corresponding A (if size == 1). 1599 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1600 A is put into the correct submat slot (if size == 1). 1601 */ 1602 if(scall != MAT_REUSE_MATRIX) { 1603 for(i = 0, ii = 0; i < ismax; ++i) { 1604 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1605 if(size > 1) { 1606 /* 1607 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1608 */ 1609 /* Construct submat[i] from the Seq pieces A and B. 
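                 makefromseq (for MPIAIJ this is MatCreateMPIAIJFromSeqMatrices_Private)
                 takes ownership of A[i] and B[ii] without increasing their reference
                 counts, so they must not be destroyed separately.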
*/ 1610 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr); 1611 1612 ++ii; 1613 } 1614 else { 1615 (*submat)[i] = A[i]; 1616 } 1617 } 1618 } 1619 ierr = PetscFree(A); CHKERRQ(ierr); 1620 ierr = PetscFree(B); CHKERRQ(ierr); 1621 } 1622 PetscFunctionReturn(0); 1623 }/* MatGetSubMatricesParallel_MPIXAIJ() */ 1624 1625 1626 1627 #undef __FUNCT__ 1628 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1629 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1630 { 1631 PetscErrorCode ierr; 1632 PetscFunctionBegin; 1633 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ, 1634 MatCreateMPIAIJFromSeqMatrices_Private, 1635 MatMPIAIJExtractSeqMatrices_Private); 1636 CHKERRQ(ierr); 1637 PetscFunctionReturn(0); 1638 } 1639
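
/*
   A minimal usage sketch (illustration only, not part of the library; the
   variable names and the choice of one IS with two levels of overlap are
   hypothetical).  This is the typical calling sequence that exercises the
   routines in this file, e.g. from an additive Schwarz setup:

     Mat            C;        -- assembled MPIAIJ matrix
     IS             is;
     Mat            *submats;
     PetscInt       rstart,rend;
     PetscErrorCode ierr;

     ierr = MatGetOwnershipRange(C,&rstart,&rend);CHKERRQ(ierr);
     ierr = ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&is);CHKERRQ(ierr);

     -- grow the local index set by two levels of overlap
     ierr = MatIncreaseOverlap(C,1,&is,2);CHKERRQ(ierr);

     -- extract the sequential submatrix defined by the overlapping rows/columns
     ierr = MatGetSubMatrices(C,1,&is,&is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);

     ... use submats[0] ...

     ierr = MatDestroyMatrices(1,&submats);CHKERRQ(ierr);
     ierr = ISDestroy(&is);CHKERRQ(ierr);
*/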