/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

/* Helpers implementing one round of overlap growth (see _Once below) */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
/* NOTE(review): declared here with char** for the table argument but defined
   below with PetscBT*; presumably PetscBT is a char*-based typedef so the two
   agree -- confirm against petscbt.h */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - grows each of the imax index sets in is[] by
   ov levels of matrix adjacency; each level is one call to the _Once helper,
   which replaces the IS objects in is[] with the enlarged sets.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - one round of overlap growth.  Rows owned
   locally are expanded in place (via _Local); rows owned by other processes
   are shipped there in the message format above, expanded remotely (via
   _Receive on the other side), and the replies are merged back before the
   input ISs are destroyed and rebuilt from the enlarged sets.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    /* count, per owning process, how many of this IS's rows it owns */
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    /* w1[j] = total data length for proc j; w3[j] = no of ISs that touch proc j */
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* carve the single tmp buffer into per-destination slices of length w1[j] */
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* ptr[j] walks the data area past the header */
  }

  /* Memory for doing local proc's work: per-IS bit table (dedup) and result array */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
      }
      /* Update the headers for the current IS: (IS id, count) pairs */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input ISs are consumed; they are recreated from data[]/isz[] at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps with the communication above) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the requests received from other processes */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      /* recv_status order must match the posted-Irecv node order */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same (IS id, count) header format as phase 1 */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (destroyed) input ISs from the enlarged sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
345 346 Output: 347 isz - array containing the count of the solution elements corresponding 348 to each index set; 349 data - pointer to the solutions 350 */ 351 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 352 { 353 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 354 Mat A = c->A,B = c->B; 355 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 356 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 357 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 358 PetscBT table_i; 359 360 PetscFunctionBegin; 361 rstart = C->rmap->rstart; 362 cstart = C->cmap->rstart; 363 ai = a->i; 364 aj = a->j; 365 bi = b->i; 366 bj = b->j; 367 garray = c->garray; 368 369 370 for (i=0; i<imax; i++) { 371 data_i = data[i]; 372 table_i = table[i]; 373 isz_i = isz[i]; 374 for (j=0,max=isz[i]; j<max; j++) { 375 row = data_i[j] - rstart; 376 start = ai[row]; 377 end = ai[row+1]; 378 for (k=start; k<end; k++) { /* Amat */ 379 val = aj[k] + cstart; 380 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 381 } 382 start = bi[row]; 383 end = bi[row+1]; 384 for (k=start; k<end; k++) { /* Bmat */ 385 val = garray[bj[k]]; 386 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 387 } 388 } 389 isz[i] = isz_i; 390 } 391 PetscFunctionReturn(0); 392 } 393 394 #undef __FUNCT__ 395 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 396 /* 397 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 398 and return the output 399 400 Input: 401 C - the matrix 402 nrqr - no of messages being processed. 403 rbuf - an array of pointers to the recieved requests 404 405 Output: 406 xdata - array of messages to be sent back 407 isz1 - size of each message 408 409 For better efficiency perhaps we should malloc separately each xdata[i], 410 then if a remalloc is required we need only copy the data for that one row 411 rather then all previous rows as it is now where a single large chunck of 412 memory is used. 
413 414 */ 415 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 416 { 417 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 418 Mat A = c->A,B = c->B; 419 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 420 PetscErrorCode ierr; 421 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 422 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 423 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 424 PetscInt *rbuf_i,kmax,rbuf_0; 425 PetscBT xtable; 426 427 PetscFunctionBegin; 428 m = C->rmap->N; 429 rstart = C->rmap->rstart; 430 cstart = C->cmap->rstart; 431 ai = a->i; 432 aj = a->j; 433 bi = b->i; 434 bj = b->j; 435 garray = c->garray; 436 437 438 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 439 rbuf_i = rbuf[i]; 440 rbuf_0 = rbuf_i[0]; 441 ct += rbuf_0; 442 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 443 } 444 445 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 446 else max1 = 1; 447 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 448 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 449 ++no_malloc; 450 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 451 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 452 453 ct3 = 0; 454 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 455 rbuf_i = rbuf[i]; 456 rbuf_0 = rbuf_i[0]; 457 ct1 = 2*rbuf_0+1; 458 ct2 = ct1; 459 ct3 += ct1; 460 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 461 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 462 oct2 = ct2; 463 kmax = rbuf_i[2*j]; 464 for (k=0; k<kmax; k++,ct1++) { 465 row = rbuf_i[ct1]; 466 if (!PetscBTLookupSet(xtable,row)) { 467 if (!(ct3 < mem_estimate)) { 468 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 469 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 470 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 471 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 472 xdata[0] = tmp; 473 mem_estimate = new_estimate; ++no_malloc; 474 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 475 } 476 xdata[i][ct2++] = row; 477 ct3++; 478 } 479 } 480 for (k=oct2,max2=ct2; k<max2; k++) { 481 row = xdata[i][k] - rstart; 482 start = ai[row]; 483 end = ai[row+1]; 484 for (l=start; l<end; l++) { 485 val = aj[l] + cstart; 486 if (!PetscBTLookupSet(xtable,val)) { 487 if (!(ct3 < mem_estimate)) { 488 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 489 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 490 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 491 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 492 xdata[0] = tmp; 493 mem_estimate = new_estimate; ++no_malloc; 494 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 495 } 496 xdata[i][ct2++] = val; 497 ct3++; 498 } 499 } 500 start = bi[row]; 501 end = bi[row+1]; 502 for (l=start; l<end; l++) { 503 val = garray[bj[l]]; 504 if (!PetscBTLookupSet(xtable,val)) { 505 if (!(ct3 < 
mem_estimate)) { 506 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 507 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 508 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 509 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 510 xdata[0] = tmp; 511 mem_estimate = new_estimate; ++no_malloc; 512 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 513 } 514 xdata[i][ct2++] = val; 515 ct3++; 516 } 517 } 518 } 519 /* Update the header*/ 520 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 521 xdata[i][2*j-1] = rbuf_i[2*j-1]; 522 } 523 xdata[i][0] = rbuf_0; 524 xdata[i+1] = xdata[i] + ct2; 525 isz1[i] = ct2; /* size of each message */ 526 } 527 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 528 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 529 PetscFunctionReturn(0); 530 } 531 /* -------------------------------------------------------------------------*/ 532 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 533 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 534 /* 535 Every processor gets the entire matrix 536 */ 537 #undef __FUNCT__ 538 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 539 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 540 { 541 Mat B; 542 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 543 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 544 PetscErrorCode ierr; 545 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 546 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 547 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 548 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 549 550 PetscFunctionBegin; 551 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 552 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 553 554 if (scall == MAT_INITIAL_MATRIX) { 555 /* ---------------------------------------------------------------- 556 Tell every processor the number of nonzeros per row 557 */ 558 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 559 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 560 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 561 } 562 sendcount = A->rmap->rend - A->rmap->rstart; 563 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 564 for (i=0; i<size; i++) { 565 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 566 displs[i] = A->rmap->range[i]; 567 } 568 #if defined(PETSC_HAVE_MPI_IN_PLACE) 569 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 570 #else 571 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 572 #endif 573 /* --------------------------------------------------------------- 574 Create the sequential matrix of the same type as the local block diagonal 575 */ 576 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 577 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 578 ierr = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr); 579 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 580 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 581 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 582 **Bin = B; 583 b = (Mat_SeqAIJ*)B->data; 584 585 /*-------------------------------------------------------------------- 586 Copy my part of matrix column indices over 587 */ 588 sendcount = ad->nz + bd->nz; 589 jsendbuf = b->j + b->i[rstarts[rank]]; 590 a_jsendbuf = ad->j; 591 b_jsendbuf = bd->j; 592 n = A->rmap->rend - 
A->rmap->rstart; 593 cnt = 0; 594 for (i=0; i<n; i++) { 595 596 /* put in lower diagonal portion */ 597 m = bd->i[i+1] - bd->i[i]; 598 while (m > 0) { 599 /* is it above diagonal (in bd (compressed) numbering) */ 600 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 601 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 602 m--; 603 } 604 605 /* put in diagonal portion */ 606 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 607 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 608 } 609 610 /* put in upper diagonal portion */ 611 while (m-- > 0) { 612 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 613 } 614 } 615 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 616 617 /*-------------------------------------------------------------------- 618 Gather all column indices to all processors 619 */ 620 for (i=0; i<size; i++) { 621 recvcounts[i] = 0; 622 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 623 recvcounts[i] += lens[j]; 624 } 625 } 626 displs[0] = 0; 627 for (i=1; i<size; i++) { 628 displs[i] = displs[i-1] + recvcounts[i-1]; 629 } 630 #if defined(PETSC_HAVE_MPI_IN_PLACE) 631 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 632 #else 633 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 634 #endif 635 /*-------------------------------------------------------------------- 636 Assemble the matrix into useable form (note numerical values not yet set) 637 */ 638 /* set the b->ilen (length of each row) values */ 639 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 640 /* set the b->i indices */ 641 b->i[0] = 0; 642 for (i=1; i<=A->rmap->N; i++) { 643 b->i[i] = b->i[i-1] + lens[i-1]; 644 } 645 ierr = PetscFree(lens);CHKERRQ(ierr); 646 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 647 ierr = 
MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 648 649 } else { 650 B = **Bin; 651 b = (Mat_SeqAIJ*)B->data; 652 } 653 654 /*-------------------------------------------------------------------- 655 Copy my part of matrix numerical values into the values location 656 */ 657 if (flag == MAT_GET_VALUES) { 658 sendcount = ad->nz + bd->nz; 659 sendbuf = b->a + b->i[rstarts[rank]]; 660 a_sendbuf = ad->a; 661 b_sendbuf = bd->a; 662 b_sendj = bd->j; 663 n = A->rmap->rend - A->rmap->rstart; 664 cnt = 0; 665 for (i=0; i<n; i++) { 666 667 /* put in lower diagonal portion */ 668 m = bd->i[i+1] - bd->i[i]; 669 while (m > 0) { 670 /* is it above diagonal (in bd (compressed) numbering) */ 671 if (garray[*b_sendj] > A->rmap->rstart + i) break; 672 sendbuf[cnt++] = *b_sendbuf++; 673 m--; 674 b_sendj++; 675 } 676 677 /* put in diagonal portion */ 678 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 679 sendbuf[cnt++] = *a_sendbuf++; 680 } 681 682 /* put in upper diagonal portion */ 683 while (m-- > 0) { 684 sendbuf[cnt++] = *b_sendbuf++; 685 b_sendj++; 686 } 687 } 688 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 689 690 /* ----------------------------------------------------------------- 691 Gather all numerical values to all processors 692 */ 693 if (!recvcounts) { 694 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 695 } 696 for (i=0; i<size; i++) { 697 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 698 } 699 displs[0] = 0; 700 for (i=1; i<size; i++) { 701 displs[i] = displs[i-1] + recvcounts[i-1]; 702 } 703 recvbuf = b->a; 704 #if defined(PETSC_HAVE_MPI_IN_PLACE) 705 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 706 #else 707 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 708 #endif 709 } 
/* endof (flag == MAT_GET_VALUES) */ 710 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 711 712 if (A->symmetric) { 713 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 714 } else if (A->hermitian) { 715 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 716 } else if (A->structurally_symmetric) { 717 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 718 } 719 PetscFunctionReturn(0); 720 } 721 722 723 724 #undef __FUNCT__ 725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 727 { 728 PetscErrorCode ierr; 729 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 730 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 731 732 PetscFunctionBegin; 733 /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */ 734 /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level. 735 However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that 736 take care to order the result correctly by assembling it with MatSetValues() (after preallocating). 
737 */ 738 for (i = 0; i < ismax; ++i) { 739 PetscBool sorted; 740 ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr); 741 if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i); 742 } 743 744 /* 745 Check for special case: each processor gets entire matrix 746 */ 747 if (ismax == 1 && C->rmap->N == C->cmap->N) { 748 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 749 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 750 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 751 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 752 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 753 wantallmatrix = PETSC_TRUE; 754 755 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 756 } 757 } 758 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 759 if (twantallmatrix) { 760 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 761 PetscFunctionReturn(0); 762 } 763 764 /* Allocate memory to hold all the submatrices */ 765 if (scall != MAT_REUSE_MATRIX) { 766 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 767 } 768 769 /* Check for special case: each processor gets entire matrix columns */ 770 ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr); 771 for (i=0; i<ismax; i++) { 772 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 773 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 774 if (colflag && ncol == C->cmap->N) { 775 allcolumns[i] = PETSC_TRUE; 776 } else { 777 allcolumns[i] = PETSC_FALSE; 778 } 779 } 780 781 /* Determine the number of stages through which submatrices are done */ 782 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 783 784 /* 785 Each stage will extract nmax submatrices. 786 nmax is determined by the matrix column dimension. 
787 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 788 */ 789 if (!nmax) nmax = 1; 790 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 791 792 /* Make sure every processor loops through the nstages */ 793 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 794 795 for (i=0,pos=0; i<nstages; i++) { 796 if (pos+nmax <= ismax) max_no = nmax; 797 else if (pos == ismax) max_no = 0; 798 else max_no = ismax-pos; 799 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 800 pos += max_no; 801 } 802 803 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 804 PetscFunctionReturn(0); 805 } 806 807 /* -------------------------------------------------------------------------*/ 808 #undef __FUNCT__ 809 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 810 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 811 { 812 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 813 Mat A = c->A; 814 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 815 const PetscInt **icol,**irow; 816 PetscInt *nrow,*ncol,start; 817 PetscErrorCode ierr; 818 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 819 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 820 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 821 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 822 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 823 #if defined(PETSC_USE_CTABLE) 824 PetscTable *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i; 825 #else 826 PetscInt **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i; 827 #endif 828 const PetscInt *irow_i; 829 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 830 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 831 MPI_Request *r_waits4,*s_waits3,*s_waits4; 832 MPI_Status 
*r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 833 MPI_Status *r_status3,*r_status4,*s_status4; 834 MPI_Comm comm; 835 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 836 PetscMPIInt *onodes1,*olengths1; 837 PetscMPIInt idex,idex2,end; 838 839 PetscFunctionBegin; 840 comm = ((PetscObject)C)->comm; 841 tag0 = ((PetscObject)C)->tag; 842 size = c->size; 843 rank = c->rank; 844 845 /* Get some new tags to keep the communication clean */ 846 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 847 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 848 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 849 850 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 851 852 for (i=0; i<ismax; i++) { 853 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 854 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 855 if (allcolumns[i]) { 856 icol[i] = PETSC_NULL; 857 ncol[i] = C->cmap->N; 858 } else { 859 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 860 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 861 } 862 } 863 864 /* evaluate communication - mesg to who, length of mesg, and buffer space 865 required. 
Based on this, buffers are allocated, and data copied into them*/ 866 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 867 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 868 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 869 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 870 for (i=0; i<ismax; i++) { 871 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 872 jmax = nrow[i]; 873 irow_i = irow[i]; 874 for (j=0; j<jmax; j++) { 875 l = 0; 876 row = irow_i[j]; 877 while (row >= C->rmap->range[l+1]) l++; 878 proc = l; 879 w4[proc]++; 880 } 881 for (j=0; j<size; j++) { 882 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 883 } 884 } 885 886 nrqs = 0; /* no of outgoing messages */ 887 msz = 0; /* total mesg length (for all procs) */ 888 w1[rank] = 0; /* no mesg sent to self */ 889 w3[rank] = 0; 890 for (i=0; i<size; i++) { 891 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 892 } 893 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 894 for (i=0,j=0; i<size; i++) { 895 if (w1[i]) { pa[j] = i; j++; } 896 } 897 898 /* Each message would have a header = 1 + 2*(no of IS) + data */ 899 for (i=0; i<nrqs; i++) { 900 j = pa[i]; 901 w1[j] += w2[j] + 2* w3[j]; 902 msz += w1[j]; 903 } 904 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 905 906 /* Determine the number of messages to expect, their lengths, from from-ids */ 907 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 908 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 909 910 /* Now post the Irecvs corresponding to these messages */ 911 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 912 913 ierr = PetscFree(onodes1);CHKERRQ(ierr); 914 ierr = PetscFree(olengths1);CHKERRQ(ierr); 915 916 /* Allocate Memory for outgoing messages */ 917 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 918 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 919 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 920 921 { 922 PetscInt *iptr = tmp,ict = 0; 923 for (i=0; i<nrqs; i++) { 924 j = pa[i]; 925 iptr += ict; 926 sbuf1[j] = iptr; 927 ict = w1[j]; 928 } 929 } 930 931 /* Form the outgoing messages */ 932 /* Initialize the header space */ 933 for (i=0; i<nrqs; i++) { 934 j = pa[i]; 935 sbuf1[j][0] = 0; 936 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 937 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 938 } 939 940 /* Parse the isrow and copy data into outbuf */ 941 for (i=0; i<ismax; i++) { 942 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 943 irow_i = irow[i]; 944 jmax = nrow[i]; 945 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 946 l = 0; 947 row = irow_i[j]; 948 while (row >= C->rmap->range[l+1]) l++; 949 proc = l; 950 if (proc != rank) { /* copy to the outgoing buf*/ 951 ctr[proc]++; 952 *ptr[proc] = row; 953 ptr[proc]++; 954 } 955 } 956 /* Update the headers for the current IS */ 957 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 958 if ((ctr_j = ctr[j])) { 959 sbuf1_j = sbuf1[j]; 960 k = ++sbuf1_j[0]; 961 sbuf1_j[2*k] = ctr_j; 962 sbuf1_j[2*k-1] = i; 963 } 964 } 965 } 966 967 /* Now post the sends */ 968 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 969 for (i=0; i<nrqs; ++i) { 970 j = pa[i]; 971 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 972 } 973 974 /* Post Receives to capture the buffer size */ 975 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 976 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 977 rbuf2[0] = tmp + msz; 978 for (i=1; i<nrqs; ++i) { 979 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 980 } 981 for (i=0; i<nrqs; ++i) { 982 j = pa[i]; 983 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 984 } 985 986 /* Send to other procs the buf size they should allocate */ 987 988 989 /* Receive messages*/ 990 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 991 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 992 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 993 { 994 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 995 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 996 PetscInt *sbuf2_i; 997 998 for (i=0; i<nrqr; ++i) { 999 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1000 1001 req_size[idex] = 0; 1002 rbuf1_i = rbuf1[idex]; 1003 start = 2*rbuf1_i[0] + 1; 1004 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1005 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 1006 sbuf2_i = sbuf2[idex]; 1007 for (j=start; j<end; j++) { 1008 id = rbuf1_i[j] - rstart; 1009 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1010 sbuf2_i[j] = ncols; 1011 req_size[idex] += ncols; 1012 } 1013 req_source[idex] = r_status1[i].MPI_SOURCE; 1014 /* form the header */ 1015 sbuf2_i[0] = req_size[idex]; 1016 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1017 1018 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1019 } 1020 } 1021 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1022 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1023 1024 /* recv buffer sizes */ 1025 /* Receive messages*/ 1026 1027 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1028 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1029 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1030 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1031 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1032 1033 for (i=0; i<nrqs; ++i) { 1034 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1035 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1036 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1037 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1038 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1039 } 1040 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1041 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1042 1043 /* Wait on sends1 and sends2 */ 1044 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1045 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1046 1047 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1048 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1049 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1050 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1051 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1052 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1053 1054 /* Now allocate buffers for a->j, and send them off */ 1055 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1056 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1057 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1058 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1059 1060 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1061 { 1062 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1063 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1064 PetscInt cend = C->cmap->rend; 1065 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1066 1067 for (i=0; i<nrqr; i++) { 1068 rbuf1_i = rbuf1[i]; 1069 sbuf_aj_i = sbuf_aj[i]; 1070 ct1 = 2*rbuf1_i[0] + 1; 1071 ct2 = 0; 1072 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1073 kmax = rbuf1[i][2*j]; 1074 for (k=0; k<kmax; k++,ct1++) { 1075 row = rbuf1_i[ct1] - rstart; 1076 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1077 ncols = nzA + nzB; 1078 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1079 1080 /* load the column indices for this row into cols*/ 1081 cols = sbuf_aj_i + ct2; 1082 1083 lwrite = 0; 1084 for (l=0; l<nzB; l++) { 1085 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1086 } 1087 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1088 for (l=0; l<nzB; l++) { 1089 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1090 } 1091 1092 ct2 += ncols; 1093 } 1094 } 1095 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1096 } 1097 } 1098 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1099 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1100 1101 /* Allocate buffers for a->a, and send them off */ 1102 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1103 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1104 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1105 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1106 1107 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1108 { 1109 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1110 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1111 PetscInt cend = C->cmap->rend; 1112 PetscInt *b_j = b->j; 1113 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1114 1115 for (i=0; 
i<nrqr; i++) { 1116 rbuf1_i = rbuf1[i]; 1117 sbuf_aa_i = sbuf_aa[i]; 1118 ct1 = 2*rbuf1_i[0]+1; 1119 ct2 = 0; 1120 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1121 kmax = rbuf1_i[2*j]; 1122 for (k=0; k<kmax; k++,ct1++) { 1123 row = rbuf1_i[ct1] - rstart; 1124 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1125 ncols = nzA + nzB; 1126 cworkB = b_j + b_i[row]; 1127 vworkA = a_a + a_i[row]; 1128 vworkB = b_a + b_i[row]; 1129 1130 /* load the column values for this row into vals*/ 1131 vals = sbuf_aa_i+ct2; 1132 1133 lwrite = 0; 1134 for (l=0; l<nzB; l++) { 1135 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1136 } 1137 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1138 for (l=0; l<nzB; l++) { 1139 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1140 } 1141 1142 ct2 += ncols; 1143 } 1144 } 1145 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1146 } 1147 } 1148 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1149 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1150 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1151 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1152 1153 /* Form the matrix */ 1154 /* create col map: global col of C -> local col of submatrices */ 1155 { 1156 const PetscInt *icol_i; 1157 #if defined(PETSC_USE_CTABLE) 1158 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1159 for (i=0; i<ismax; i++) { 1160 if (!allcolumns[i]) { 1161 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1162 1163 jmax = ncol[i]; 1164 icol_i = icol[i]; 1165 cmap_i = cmap[i]; 1166 for (j=0; j<jmax; j++) { 1167 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1168 } 1169 } else { 1170 cmap[i] = PETSC_NULL; 1171 } 1172 } 1173 #else 1174 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1175 for (i=0; i<ismax; i++) { 1176 if (!allcolumns[i]) { 1177 ierr = 
PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1178 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1179 jmax = ncol[i]; 1180 icol_i = icol[i]; 1181 cmap_i = cmap[i]; 1182 for (j=0; j<jmax; j++) { 1183 cmap_i[icol_i[j]] = j+1; 1184 } 1185 } else { 1186 cmap[i] = PETSC_NULL; 1187 } 1188 } 1189 #endif 1190 } 1191 1192 /* Create lens which is required for MatCreate... */ 1193 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1194 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1195 if (ismax) { 1196 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1197 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1198 } 1199 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1200 1201 /* Update lens from local data */ 1202 for (i=0; i<ismax; i++) { 1203 jmax = nrow[i]; 1204 if (!allcolumns[i]) cmap_i = cmap[i]; 1205 irow_i = irow[i]; 1206 lens_i = lens[i]; 1207 for (j=0; j<jmax; j++) { 1208 l = 0; 1209 row = irow_i[j]; 1210 while (row >= C->rmap->range[l+1]) l++; 1211 proc = l; 1212 if (proc == rank) { 1213 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1214 if (!allcolumns[i]) { 1215 for (k=0; k<ncols; k++) { 1216 #if defined(PETSC_USE_CTABLE) 1217 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1218 #else 1219 tcol = cmap_i[cols[k]]; 1220 #endif 1221 if (tcol) lens_i[j]++; 1222 } 1223 } else { /* allcolumns */ 1224 lens_i[j] = ncols; 1225 } 1226 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1227 } 1228 } 1229 } 1230 1231 /* Create row map: global row of C -> local row of submatrices */ 1232 #if defined(PETSC_USE_CTABLE) 1233 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1234 for (i=0; i<ismax; i++) { 1235 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1236 rmap_i = rmap[i]; 1237 irow_i = irow[i]; 1238 jmax = nrow[i]; 1239 for (j=0; j<jmax; j++) { 1240 ierr = 
PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1241 } 1242 } 1243 #else 1244 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1245 if (ismax) { 1246 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1247 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1248 } 1249 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1250 for (i=0; i<ismax; i++) { 1251 rmap_i = rmap[i]; 1252 irow_i = irow[i]; 1253 jmax = nrow[i]; 1254 for (j=0; j<jmax; j++) { 1255 rmap_i[irow_i[j]] = j; 1256 } 1257 } 1258 #endif 1259 1260 /* Update lens from offproc data */ 1261 { 1262 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1263 1264 for (tmp2=0; tmp2<nrqs; tmp2++) { 1265 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1266 idex = pa[idex2]; 1267 sbuf1_i = sbuf1[idex]; 1268 jmax = sbuf1_i[0]; 1269 ct1 = 2*jmax+1; 1270 ct2 = 0; 1271 rbuf2_i = rbuf2[idex2]; 1272 rbuf3_i = rbuf3[idex2]; 1273 for (j=1; j<=jmax; j++) { 1274 is_no = sbuf1_i[2*j-1]; 1275 max1 = sbuf1_i[2*j]; 1276 lens_i = lens[is_no]; 1277 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1278 rmap_i = rmap[is_no]; 1279 for (k=0; k<max1; k++,ct1++) { 1280 #if defined(PETSC_USE_CTABLE) 1281 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1282 row--; 1283 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1284 #else 1285 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1286 #endif 1287 max2 = rbuf2_i[ct1]; 1288 for (l=0; l<max2; l++,ct2++) { 1289 if (!allcolumns[is_no]) { 1290 #if defined(PETSC_USE_CTABLE) 1291 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1292 #else 1293 tcol = cmap_i[rbuf3_i[ct2]]; 1294 #endif 1295 if (tcol) lens_i[row]++; 1296 } else { /* allcolumns */ 1297 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1298 } 1299 } 1300 } 1301 } 1302 } 1303 } 1304 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1305 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1306 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1307 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1308 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1309 1310 /* Create the submatrices */ 1311 if (scall == MAT_REUSE_MATRIX) { 1312 PetscBool flag; 1313 1314 /* 1315 Assumes new rows are same length as the old rows,hence bug! 1316 */ 1317 for (i=0; i<ismax; i++) { 1318 mat = (Mat_SeqAIJ*)(submats[i]->data); 1319 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1320 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1321 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1322 /* Initial matrix as if empty */ 1323 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1324 1325 submats[i]->factortype = C->factortype; 1326 } 1327 } else { 1328 for (i=0; i<ismax; i++) { 1329 PetscInt rbs,cbs; 1330 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1331 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1332 1333 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1334 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1335 1336 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1337 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1338 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1339 } 1340 } 1341 1342 /* Assemble the matrices */ 1343 /* First assemble the local rows */ 1344 { 1345 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1346 PetscScalar *imat_a; 1347 1348 for (i=0; i<ismax; i++) { 1349 mat = (Mat_SeqAIJ*)submats[i]->data; 1350 imat_ilen = mat->ilen; 1351 imat_j = mat->j; 
1352 imat_i = mat->i; 1353 imat_a = mat->a; 1354 1355 if (!allcolumns[i]) cmap_i = cmap[i]; 1356 rmap_i = rmap[i]; 1357 irow_i = irow[i]; 1358 jmax = nrow[i]; 1359 for (j=0; j<jmax; j++) { 1360 l = 0; 1361 row = irow_i[j]; 1362 while (row >= C->rmap->range[l+1]) l++; 1363 proc = l; 1364 if (proc == rank) { 1365 old_row = row; 1366 #if defined(PETSC_USE_CTABLE) 1367 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1368 row--; 1369 #else 1370 row = rmap_i[row]; 1371 #endif 1372 ilen_row = imat_ilen[row]; 1373 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1374 mat_i = imat_i[row]; 1375 mat_a = imat_a + mat_i; 1376 mat_j = imat_j + mat_i; 1377 if (!allcolumns[i]) { 1378 for (k=0; k<ncols; k++) { 1379 #if defined(PETSC_USE_CTABLE) 1380 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1381 #else 1382 tcol = cmap_i[cols[k]]; 1383 #endif 1384 if (tcol) { 1385 *mat_j++ = tcol - 1; 1386 *mat_a++ = vals[k]; 1387 ilen_row++; 1388 } 1389 } 1390 } else { /* allcolumns */ 1391 for (k=0; k<ncols; k++) { 1392 *mat_j++ = cols[k]; /* global col index! 
*/ 1393 *mat_a++ = vals[k]; 1394 ilen_row++; 1395 } 1396 } 1397 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1398 imat_ilen[row] = ilen_row; 1399 } 1400 } 1401 } 1402 } 1403 1404 /* Now assemble the off proc rows*/ 1405 { 1406 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1407 PetscInt *imat_j,*imat_i; 1408 PetscScalar *imat_a,*rbuf4_i; 1409 1410 for (tmp2=0; tmp2<nrqs; tmp2++) { 1411 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1412 idex = pa[idex2]; 1413 sbuf1_i = sbuf1[idex]; 1414 jmax = sbuf1_i[0]; 1415 ct1 = 2*jmax + 1; 1416 ct2 = 0; 1417 rbuf2_i = rbuf2[idex2]; 1418 rbuf3_i = rbuf3[idex2]; 1419 rbuf4_i = rbuf4[idex2]; 1420 for (j=1; j<=jmax; j++) { 1421 is_no = sbuf1_i[2*j-1]; 1422 rmap_i = rmap[is_no]; 1423 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1424 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1425 imat_ilen = mat->ilen; 1426 imat_j = mat->j; 1427 imat_i = mat->i; 1428 imat_a = mat->a; 1429 max1 = sbuf1_i[2*j]; 1430 for (k=0; k<max1; k++,ct1++) { 1431 row = sbuf1_i[ct1]; 1432 #if defined(PETSC_USE_CTABLE) 1433 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1434 row--; 1435 #else 1436 row = rmap_i[row]; 1437 #endif 1438 ilen = imat_ilen[row]; 1439 mat_i = imat_i[row]; 1440 mat_a = imat_a + mat_i; 1441 mat_j = imat_j + mat_i; 1442 max2 = rbuf2_i[ct1]; 1443 if (!allcolumns[is_no]) { 1444 for (l=0; l<max2; l++,ct2++) { 1445 1446 #if defined(PETSC_USE_CTABLE) 1447 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1448 #else 1449 tcol = cmap_i[rbuf3_i[ct2]]; 1450 #endif 1451 if (tcol) { 1452 *mat_j++ = tcol - 1; 1453 *mat_a++ = rbuf4_i[ct2]; 1454 ilen++; 1455 } 1456 } 1457 } else { /* allcolumns */ 1458 for (l=0; l<max2; l++,ct2++) { 1459 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1460 *mat_a++ = rbuf4_i[ct2]; 1461 ilen++; 1462 } 1463 } 1464 imat_ilen[row] = ilen; 1465 } 1466 } 1467 } 1468 } 1469 1470 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1471 ierr = 
PetscFree(r_waits4);CHKERRQ(ierr); 1472 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1473 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1474 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1475 1476 /* Restore the indices */ 1477 for (i=0; i<ismax; i++) { 1478 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1479 if (!allcolumns[i]) { 1480 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1481 } 1482 } 1483 1484 /* Destroy allocated memory */ 1485 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1486 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1487 ierr = PetscFree(pa);CHKERRQ(ierr); 1488 1489 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1490 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1491 for (i=0; i<nrqr; ++i) { 1492 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1493 } 1494 for (i=0; i<nrqs; ++i) { 1495 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1496 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1497 } 1498 1499 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1500 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1501 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1502 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1503 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1504 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1505 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1506 1507 #if defined(PETSC_USE_CTABLE) 1508 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1509 #else 1510 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1511 #endif 1512 ierr = PetscFree(rmap);CHKERRQ(ierr); 1513 1514 for (i=0; i<ismax; i++) { 1515 if (!allcolumns[i]) { 1516 #if defined(PETSC_USE_CTABLE) 1517 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1518 #else 1519 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1520 #endif 1521 } 1522 } 1523 ierr = PetscFree(cmap);CHKERRQ(ierr); 1524 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1525 ierr = PetscFree(lens);CHKERRQ(ierr); 1526 1527 for (i=0; i<ismax; i++) { 1528 ierr = 
MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);  /* tail of the preceding extraction routine: finalize every extracted sequential submatrix */
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Builds an MPIAIJ matrix C on comm directly from its sequential diagonal (A) and
   off-diagonal (B) component matrices.

   Observe that the Seq matrices used to construct this MPI matrix are not increfed:
   C borrows A and B rather than owning them. Be careful not to destroy them elsewhere.

   Input:  comm - communicator for the new parallel matrix
           A    - local diagonal block (must be MATSEQAIJ)
           B    - local off-diagonal block (must be MATSEQAIJ, same local row count as A)
   Output: C    - the assembled-if-components-are-assembled MPIAIJ matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* A and B must describe the same local rows and agree on block sizes */
  if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  /* Local sizes come from the diagonal block; global sizes are derived by the layout setup */
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* Each process must contribute exactly its diag + off-diag columns to the global column count */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  aij  = (Mat_MPIAIJ*)((*C)->data);
  /* Install the borrowed components directly; no reference is taken (see header comment).
     NOTE(review): assumes MatSetType left aij->A/aij->B unset at this point, so nothing
     is leaked by these assignments — confirm against the MPIAIJ constructor. */
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B);CHKERRQ(ierr);

  /* C inherits preallocated/assembled state from its components */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

/*
   Returns (borrowed) pointers to the sequential diagonal (A) and off-diagonal (B)
   blocks of an MPIAIJ matrix C. Inverse of MatCreateMPIAIJFromSeqMatrices_Private.
*/
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! The caller must not destroy them. */
  PetscFunctionReturn(0);
}

/*
   Extracts (possibly parallel) submatrices of C indexed by isrow[]/iscol[].

   Strategy: each submatrix whose index sets live on a communicator of size > 1 is built
   as an MPIAIJ matrix from two sequential extractions — the "diag" part (columns in
   iscol[i]) and the "off-diag" part (the nonlocal complement of iscol[i]).  Purely
   sequential index sets fall through to getsubmats_seq directly.

   The three function-pointer arguments abstract the matrix format:
     getsubmats_seq - extracts arrays of sequential submatrices (allocates the array
                      itself when scall == MAT_INITIAL_MATRIX)
     makefromseq    - assembles a parallel matrix from a (diag, off-diag) Seq pair
     extractseq     - recovers the (diag, off-diag) Seq pair from an existing parallel matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;   /* number of index sets living on a parallel (size > 1) communicator */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Validate the IS pairs and count how many are parallel */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat         *A,*B;        /* A[i]: diag Seq piece for every IS; B[ii]: off-diag Seq piece for parallel ISs only */
    IS          *isrow_c, *iscol_c;
    PetscMPIInt size;         /* shadows the outer 'size'; both hold per-IS communicator sizes */
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the
      intermediate arrays of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are needed in any event: even if the array of parallel matrices
      already exists, we need to consolidate the underlying seq matrices into a single array to
      serve as placeholders into which getsubmats_seq will deposit the extracted diag and
      off-diag parts. If reuse is taking place, we have to allocate the seq matrix arrays here;
      if reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i];
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i];  /* aliased, not copied — do not destroy below */

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately.
       NOTE(review): when scall != MAT_REUSE_MATRIX, A and B are uninitialized here and
       getsubmats_seq is presumed to allocate them — confirm against the callee's contract. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);  /* only the complements we created; isrow_c entries are aliases */
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else (*submat)[i] = A[i];
      }
    }
    /* Free only the holder arrays; the Seq matrices themselves are now owned/borrowed by submat[] */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
} /* MatGetSubMatricesParallel_MPIXAIJ() */



/*
   MPIAIJ front end: binds the generic parallel-submatrix driver to the
   AIJ-specific extraction, construction and decomposition routines.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}