/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that are shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1] and is[5], the message looks like
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1   => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5   => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
    ...          data for is[1]
  mesg [n]
  -----------
  mesg [n+1]
    ...          data for is[5]
  mesg [m]
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or have been, processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.
Based on this, buffers are allocated, and data copied into them*/ 89 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); 90 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 91 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 92 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 93 for (i=0; i<imax; i++) { 94 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 95 idx_i = idx[i]; 96 len = n[i]; 97 for (j=0; j<len; j++) { 98 row = idx_i[j]; 99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 100 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 101 w4[proc]++; 102 } 103 for (j=0; j<size; j++) { 104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 105 } 106 } 107 108 nrqs = 0; /* no of outgoing messages */ 109 msz = 0; /* total mesg length (for all proc */ 110 w1[rank] = 0; /* no mesg sent to intself */ 111 w3[rank] = 0; 112 for (i=0; i<size; i++) { 113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 114 } 115 /* pa - is list of processors to communicate with */ 116 ierr = PetscMalloc1((nrqs+1),&pa);CHKERRQ(ierr); 117 for (i=0,j=0; i<size; i++) { 118 if (w1[i]) {pa[j] = i; j++;} 119 } 120 121 /* Each message would have a header = 1 + 2*(no of IS) + data */ 122 for (i=0; i<nrqs; i++) { 123 j = pa[i]; 124 w1[j] += w2[j] + 2*w3[j]; 125 msz += w1[j]; 126 } 127 128 /* Determine the number of messages to expect, their lengths, from from-ids */ 129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 131 132 /* Now post the Irecvs corresponding to these messages */ 133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 134 135 /* Allocate Memory for outgoing messages */ 136 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr); 137 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 138 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 139 140 { 141 PetscInt *iptr = tmp,ict = 0; 142 for (i=0; i<nrqs; i++) { 143 j = pa[i]; 144 iptr += ict; 145 outdat[j] = iptr; 146 ict = w1[j]; 147 } 148 } 149 150 /* Form the outgoing messages */ 151 /*plug in the headers*/ 152 for (i=0; i<nrqs; i++) { 153 j = pa[i]; 154 outdat[j][0] = 0; 155 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 156 ptr[j] = outdat[j] + 2*w3[j] + 1; 157 } 158 159 /* Memory for doing local proc's work*/ 160 { 161 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr); 162 163 for (i=0; i<imax; i++) { 164 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 165 data[i] = d_p + M*i; 166 } 167 } 168 169 /* Parse the IS and update local tables and the outgoing buf with the data*/ 170 { 171 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 172 PetscBT table_i; 173 174 for (i=0; i<imax; i++) { 175 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 176 n_i = n[i]; 177 table_i = table[i]; 178 idx_i = idx[i]; 179 data_i = data[i]; 180 isz_i = isz[i]; 181 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 182 row = idx_i[j]; 183 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 184 if (proc != rank) { /* copy to the outgoing buffer */ 185 ctr[proc]++; 186 *ptr[proc] = row; 187 ptr[proc]++; 
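          /* the row has just been queued in its owning process's outgoing request;
             locally owned rows are instead entered directly into this IS's bit table
             (else branch below) */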
188 } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */ 189 } 190 /* Update the headers for the current IS */ 191 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 192 if ((ctr_j = ctr[j])) { 193 outdat_j = outdat[j]; 194 k = ++outdat_j[0]; 195 outdat_j[2*k] = ctr_j; 196 outdat_j[2*k-1] = i; 197 } 198 } 199 isz[i] = isz_i; 200 } 201 } 202 203 /* Now post the sends */ 204 ierr = PetscMalloc1((nrqs+1),&s_waits1);CHKERRQ(ierr); 205 for (i=0; i<nrqs; ++i) { 206 j = pa[i]; 207 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 208 } 209 210 /* No longer need the original indices*/ 211 for (i=0; i<imax; ++i) { 212 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 213 } 214 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 215 216 for (i=0; i<imax; ++i) { 217 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 218 } 219 220 /* Do Local work*/ 221 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 222 223 /* Receive messages*/ 224 ierr = PetscMalloc1((nrqr+1),&recv_status);CHKERRQ(ierr); 225 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 226 227 ierr = PetscMalloc1((nrqs+1),&s_status);CHKERRQ(ierr); 228 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 229 230 /* Phase 1 sends are complete - deallocate buffers */ 231 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 232 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 233 234 ierr = PetscMalloc1((nrqr+1),&xdata);CHKERRQ(ierr); 235 ierr = PetscMalloc1((nrqr+1),&isz1);CHKERRQ(ierr); 236 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 237 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 238 ierr = PetscFree(rbuf);CHKERRQ(ierr); 239 240 241 /* Send the data back*/ 242 /* Do a global reduction to know the buffer space req for incoming messages*/ 243 { 244 PetscMPIInt *rw1; 245 246 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr); 247 248 for (i=0; i<nrqr; ++i) { 249 proc = recv_status[i].MPI_SOURCE; 250 251 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 252 rw1[proc] = isz1[i]; 253 } 254 ierr = PetscFree(onodes1);CHKERRQ(ierr); 255 ierr = PetscFree(olengths1);CHKERRQ(ierr); 256 257 /* Determine the number of messages to expect, their lengths, from from-ids */ 258 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 259 ierr = PetscFree(rw1);CHKERRQ(ierr); 260 } 261 /* Now post the Irecvs corresponding to these messages */ 262 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 263 264 /* Now post the sends */ 265 ierr = PetscMalloc1((nrqr+1),&s_waits2);CHKERRQ(ierr); 266 for (i=0; i<nrqr; ++i) { 267 j = recv_status[i].MPI_SOURCE; 268 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 269 } 270 271 /* receive work done on other processors*/ 272 { 273 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 274 PetscMPIInt idex; 275 PetscBT table_i; 276 MPI_Status *status2; 277 278 ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr); 279 for (i=0; i<nrqs; ++i) { 280 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 281 /* Process the message*/ 282 rbuf2_i = rbuf2[idex]; 283 ct1 = 2*rbuf2_i[0]+1; 284 jmax = rbuf2[idex][0]; 285 for (j=1; j<=jmax; j++) { 286 max = rbuf2_i[2*j]; 287 is_no = rbuf2_i[2*j-1]; 288 isz_i = isz[is_no]; 289 data_i = data[is_no]; 290 table_i = table[is_no]; 291 for (k=0; k<max; 
k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C     - MAT_MPIAIJ;
      imax  - total no of index sets processed at a time;
      table - an array of char - size = m bits.

     Output:
      isz  - array containing the count of the solution elements corresponding
             to each index set;
      data - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

     Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

     Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc each xdata[i] separately;
   then, if a reallocation is required, we need only copy the data for that one row
   rather than for all previous rows, as it is now where a single large chunk of
   memory is used.
404 405 */ 406 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 407 { 408 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 409 Mat A = c->A,B = c->B; 410 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 411 PetscErrorCode ierr; 412 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 413 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 414 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 415 PetscInt *rbuf_i,kmax,rbuf_0; 416 PetscBT xtable; 417 418 PetscFunctionBegin; 419 m = C->rmap->N; 420 rstart = C->rmap->rstart; 421 cstart = C->cmap->rstart; 422 ai = a->i; 423 aj = a->j; 424 bi = b->i; 425 bj = b->j; 426 garray = c->garray; 427 428 429 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 430 rbuf_i = rbuf[i]; 431 rbuf_0 = rbuf_i[0]; 432 ct += rbuf_0; 433 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j]; 434 } 435 436 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 437 else max1 = 1; 438 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 439 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr); 440 ++no_malloc; 441 ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr); 442 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 443 444 ct3 = 0; 445 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 446 rbuf_i = rbuf[i]; 447 rbuf_0 = rbuf_i[0]; 448 ct1 = 2*rbuf_0+1; 449 ct2 = ct1; 450 ct3 += ct1; 451 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 452 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 453 oct2 = ct2; 454 kmax = rbuf_i[2*j]; 455 for (k=0; k<kmax; k++,ct1++) { 456 row = rbuf_i[ct1]; 457 if (!PetscBTLookupSet(xtable,row)) { 458 if (!(ct3 < mem_estimate)) { 459 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 460 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 461 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 462 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 463 xdata[0] = tmp; 464 mem_estimate = new_estimate; ++no_malloc; 465 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 466 } 467 xdata[i][ct2++] = row; 468 ct3++; 469 } 470 } 471 for (k=oct2,max2=ct2; k<max2; k++) { 472 row = xdata[i][k] - rstart; 473 start = ai[row]; 474 end = ai[row+1]; 475 for (l=start; l<end; l++) { 476 val = aj[l] + cstart; 477 if (!PetscBTLookupSet(xtable,val)) { 478 if (!(ct3 < mem_estimate)) { 479 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 480 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 481 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 482 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 483 xdata[0] = tmp; 484 mem_estimate = new_estimate; ++no_malloc; 485 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 486 } 487 xdata[i][ct2++] = val; 488 ct3++; 489 } 490 } 491 start = bi[row]; 492 end = bi[row+1]; 493 for (l=start; l<end; l++) { 494 val = garray[bj[l]]; 495 if (!PetscBTLookupSet(xtable,val)) { 496 if (!(ct3 < mem_estimate)) { 497 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 498 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr); 499 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 500 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 501 xdata[0] = tmp; 502 mem_estimate = new_estimate; ++no_malloc; 503 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; 504 } 505 xdata[i][ct2++] = val; 506 ct3++; 507 } 508 } 509 } 510 /* Update the header*/ 511 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a 
var*/ 512 xdata[i][2*j-1] = rbuf_i[2*j-1]; 513 } 514 xdata[i][0] = rbuf_0; 515 xdata[i+1] = xdata[i] + ct2; 516 isz1[i] = ct2; /* size of each message */ 517 } 518 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr); 519 ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 520 PetscFunctionReturn(0); 521 } 522 /* -------------------------------------------------------------------------*/ 523 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 524 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 525 /* 526 Every processor gets the entire matrix 527 */ 528 #undef __FUNCT__ 529 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 530 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 531 { 532 Mat B; 533 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 534 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 535 PetscErrorCode ierr; 536 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 537 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 538 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 539 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 540 541 PetscFunctionBegin; 542 ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr); 543 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr); 544 545 if (scall == MAT_INITIAL_MATRIX) { 546 /* ---------------------------------------------------------------- 547 Tell every processor the number of nonzeros per row 548 */ 549 ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr); 550 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 551 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 552 } 553 sendcount = A->rmap->rend - A->rmap->rstart; 554 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 555 for (i=0; i<size; i++) { 556 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 557 displs[i] = A->rmap->range[i]; 558 } 559 #if defined(PETSC_HAVE_MPI_IN_PLACE) 560 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 561 #else 562 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 563 #endif 564 /* --------------------------------------------------------------- 565 Create the sequential matrix of the same type as the local block diagonal 566 */ 567 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 568 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 569 ierr = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr); 570 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 571 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 572 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 573 **Bin = B; 574 b = (Mat_SeqAIJ*)B->data; 575 576 /*-------------------------------------------------------------------- 577 Copy my part of matrix column indices over 578 */ 579 sendcount = ad->nz + bd->nz; 580 jsendbuf = b->j + b->i[rstarts[rank]]; 581 a_jsendbuf = ad->j; 582 b_jsendbuf = bd->j; 583 n = A->rmap->rend - A->rmap->rstart; 584 cnt = 0; 585 for (i=0; i<n; i++) { 586 587 /* put in lower diagonal portion */ 588 m = bd->i[i+1] - bd->i[i]; 589 while (m > 0) { 590 /* is it above 
diagonal (in bd (compressed) numbering) */ 591 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 592 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 593 m--; 594 } 595 596 /* put in diagonal portion */ 597 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 598 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 599 } 600 601 /* put in upper diagonal portion */ 602 while (m-- > 0) { 603 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 604 } 605 } 606 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 607 608 /*-------------------------------------------------------------------- 609 Gather all column indices to all processors 610 */ 611 for (i=0; i<size; i++) { 612 recvcounts[i] = 0; 613 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 614 recvcounts[i] += lens[j]; 615 } 616 } 617 displs[0] = 0; 618 for (i=1; i<size; i++) { 619 displs[i] = displs[i-1] + recvcounts[i-1]; 620 } 621 #if defined(PETSC_HAVE_MPI_IN_PLACE) 622 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 623 #else 624 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 625 #endif 626 /*-------------------------------------------------------------------- 627 Assemble the matrix into useable form (note numerical values not yet set) 628 */ 629 /* set the b->ilen (length of each row) values */ 630 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 631 /* set the b->i indices */ 632 b->i[0] = 0; 633 for (i=1; i<=A->rmap->N; i++) { 634 b->i[i] = b->i[i-1] + lens[i-1]; 635 } 636 ierr = PetscFree(lens);CHKERRQ(ierr); 637 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 638 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 639 640 } else { 641 B = **Bin; 642 b = (Mat_SeqAIJ*)B->data; 643 } 644 645 /*-------------------------------------------------------------------- 646 Copy my part of matrix numerical values into the values location 647 */ 648 if (flag == MAT_GET_VALUES) { 649 sendcount = ad->nz + bd->nz; 650 sendbuf = b->a + b->i[rstarts[rank]]; 651 a_sendbuf = ad->a; 652 b_sendbuf = bd->a; 653 b_sendj = bd->j; 654 n = A->rmap->rend - A->rmap->rstart; 655 cnt = 0; 656 for (i=0; i<n; i++) { 657 658 /* put in lower diagonal portion */ 659 m = bd->i[i+1] - bd->i[i]; 660 while (m > 0) { 661 /* is it above diagonal (in bd (compressed) numbering) */ 662 if (garray[*b_sendj] > A->rmap->rstart + i) break; 663 sendbuf[cnt++] = *b_sendbuf++; 664 m--; 665 b_sendj++; 666 } 667 668 /* put in diagonal portion */ 669 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 670 sendbuf[cnt++] = *a_sendbuf++; 671 } 672 673 /* put in upper diagonal portion */ 674 while (m-- > 0) { 675 sendbuf[cnt++] = *b_sendbuf++; 676 b_sendj++; 677 } 678 } 679 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 680 681 /* ----------------------------------------------------------------- 682 Gather all numerical values to all processors 683 */ 684 if (!recvcounts) { 685 ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr); 686 } 687 for (i=0; i<size; i++) { 688 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 689 } 690 displs[0] = 0; 691 for (i=1; i<size; i++) { 692 displs[i] = displs[i-1] + recvcounts[i-1]; 693 } 694 recvbuf = b->a; 695 #if defined(PETSC_HAVE_MPI_IN_PLACE) 696 ierr = 
MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 697 #else 698 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr); 699 #endif 700 } /* endof (flag == MAT_GET_VALUES) */ 701 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 702 703 if (A->symmetric) { 704 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 705 } else if (A->hermitian) { 706 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 707 } else if (A->structurally_symmetric) { 708 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 709 } 710 PetscFunctionReturn(0); 711 } 712 713 714 715 #undef __FUNCT__ 716 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 717 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 718 { 719 PetscErrorCode ierr; 720 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 721 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 722 723 PetscFunctionBegin; 724 /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */ 725 /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level. 726 However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that 727 take care to order the result correctly by assembling it with MatSetValues() (after preallocating). 728 */ 729 for (i = 0; i < ismax; ++i) { 730 PetscBool sorted; 731 ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr); 732 if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i); 733 } 734 735 /* 736 Check for special case: each processor gets entire matrix 737 */ 738 if (ismax == 1 && C->rmap->N == C->cmap->N) { 739 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 740 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 741 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 742 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 743 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 744 wantallmatrix = PETSC_TRUE; 745 746 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr); 747 } 748 } 749 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 750 if (twantallmatrix) { 751 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 752 PetscFunctionReturn(0); 753 } 754 755 /* Allocate memory to hold all the submatrices */ 756 if (scall != MAT_REUSE_MATRIX) { 757 ierr = PetscMalloc1((ismax+1),submat);CHKERRQ(ierr); 758 } 759 760 /* Check for special case: each processor gets entire matrix columns */ 761 ierr = PetscMalloc1((ismax+1),&allcolumns);CHKERRQ(ierr); 762 for (i=0; i<ismax; i++) { 763 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 764 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 765 if (colflag && ncol == C->cmap->N) { 766 allcolumns[i] = PETSC_TRUE; 767 } else { 768 allcolumns[i] = PETSC_FALSE; 769 } 770 } 771 772 /* Determine the number of stages through which submatrices are done */ 773 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 774 775 /* 776 Each stage will extract nmax submatrices. 777 nmax is determined by the matrix column dimension. 
778 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 779 */ 780 if (!nmax) nmax = 1; 781 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); 782 783 /* Make sure every processor loops through the nstages */ 784 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr); 785 786 for (i=0,pos=0; i<nstages; i++) { 787 if (pos+nmax <= ismax) max_no = nmax; 788 else if (pos == ismax) max_no = 0; 789 else max_no = ismax-pos; 790 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 791 pos += max_no; 792 } 793 794 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 795 PetscFunctionReturn(0); 796 } 797 798 /* -------------------------------------------------------------------------*/ 799 #undef __FUNCT__ 800 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 801 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 802 { 803 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 804 Mat A = c->A; 805 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 806 const PetscInt **icol,**irow; 807 PetscInt *nrow,*ncol,start; 808 PetscErrorCode ierr; 809 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 810 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 811 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 812 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 813 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 814 #if defined(PETSC_USE_CTABLE) 815 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 816 #else 817 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 818 #endif 819 const PetscInt *irow_i; 820 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 821 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 822 MPI_Request *r_waits4,*s_waits3,*s_waits4; 823 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 824 MPI_Status *r_status3,*r_status4,*s_status4; 825 MPI_Comm comm; 826 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 827 PetscMPIInt *onodes1,*olengths1; 828 PetscMPIInt idex,idex2,end; 829 830 PetscFunctionBegin; 831 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 832 tag0 = ((PetscObject)C)->tag; 833 size = c->size; 834 rank = c->rank; 835 836 /* Get some new tags to keep the communication clean */ 837 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 838 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 839 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 840 841 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 842 843 for (i=0; i<ismax; i++) { 844 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 845 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 846 if (allcolumns[i]) { 847 icol[i] = NULL; 848 ncol[i] = C->cmap->N; 849 } else { 850 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 851 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 852 } 853 } 854 855 /* evaluate communication - mesg to who, length of mesg, and buffer space 856 required. 
Based on this, buffers are allocated, and data copied into them*/ 857 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 858 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 859 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 860 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 861 for (i=0; i<ismax; i++) { 862 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 863 jmax = nrow[i]; 864 irow_i = irow[i]; 865 for (j=0; j<jmax; j++) { 866 l = 0; 867 row = irow_i[j]; 868 while (row >= C->rmap->range[l+1]) l++; 869 proc = l; 870 w4[proc]++; 871 } 872 for (j=0; j<size; j++) { 873 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 874 } 875 } 876 877 nrqs = 0; /* no of outgoing messages */ 878 msz = 0; /* total mesg length (for all procs) */ 879 w1[rank] = 0; /* no mesg sent to self */ 880 w3[rank] = 0; 881 for (i=0; i<size; i++) { 882 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 883 } 884 ierr = PetscMalloc1((nrqs+1),&pa);CHKERRQ(ierr); /*(proc -array)*/ 885 for (i=0,j=0; i<size; i++) { 886 if (w1[i]) { pa[j] = i; j++; } 887 } 888 889 /* Each message would have a header = 1 + 2*(no of IS) + data */ 890 for (i=0; i<nrqs; i++) { 891 j = pa[i]; 892 w1[j] += w2[j] + 2* w3[j]; 893 msz += w1[j]; 894 } 895 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 896 897 /* Determine the number of messages to expect, their lengths, from from-ids */ 898 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 899 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 900 901 /* Now post the Irecvs corresponding to these messages */ 902 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 903 904 ierr = PetscFree(onodes1);CHKERRQ(ierr); 905 ierr = PetscFree(olengths1);CHKERRQ(ierr); 906 907 /* Allocate Memory for outgoing messages */ 908 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 909 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 910 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 911 912 { 913 PetscInt *iptr = tmp,ict = 0; 914 for (i=0; i<nrqs; i++) { 915 j = pa[i]; 916 iptr += ict; 917 sbuf1[j] = iptr; 918 ict = w1[j]; 919 } 920 } 921 922 /* Form the outgoing messages */ 923 /* Initialize the header space */ 924 for (i=0; i<nrqs; i++) { 925 j = pa[i]; 926 sbuf1[j][0] = 0; 927 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 928 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 929 } 930 931 /* Parse the isrow and copy data into outbuf */ 932 for (i=0; i<ismax; i++) { 933 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 934 irow_i = irow[i]; 935 jmax = nrow[i]; 936 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 937 l = 0; 938 row = irow_i[j]; 939 while (row >= C->rmap->range[l+1]) l++; 940 proc = l; 941 if (proc != rank) { /* copy to the outgoing buf*/ 942 ctr[proc]++; 943 *ptr[proc] = row; 944 ptr[proc]++; 945 } 946 } 947 /* Update the headers for the current IS */ 948 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 949 if ((ctr_j = ctr[j])) { 950 sbuf1_j = sbuf1[j]; 951 k = ++sbuf1_j[0]; 952 sbuf1_j[2*k] = ctr_j; 953 sbuf1_j[2*k-1] = i; 954 } 955 } 956 } 957 958 /* Now post the sends */ 959 ierr = PetscMalloc1((nrqs+1),&s_waits1);CHKERRQ(ierr); 960 
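  /* Layout of each request message sbuf1[j] (illustrative example only): if two index
     sets contribute rows owned by process j, say 3 rows of isrow[0] and 2 rows of
     isrow[2], the buffer is
         { 2,  0,3,  2,2,  r0,r1,r2,  s0,s1 }
     i.e. the count of contributing index sets, then (is number, row count) pairs,
     then the requested global row numbers grouped by index set. */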
for (i=0; i<nrqs; ++i) { 961 j = pa[i]; 962 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 963 } 964 965 /* Post Receives to capture the buffer size */ 966 ierr = PetscMalloc1((nrqs+1),&r_waits2);CHKERRQ(ierr); 967 ierr = PetscMalloc1((nrqs+1),&rbuf2);CHKERRQ(ierr); 968 rbuf2[0] = tmp + msz; 969 for (i=1; i<nrqs; ++i) { 970 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 971 } 972 for (i=0; i<nrqs; ++i) { 973 j = pa[i]; 974 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 975 } 976 977 /* Send to other procs the buf size they should allocate */ 978 979 980 /* Receive messages*/ 981 ierr = PetscMalloc1((nrqr+1),&s_waits2);CHKERRQ(ierr); 982 ierr = PetscMalloc1((nrqr+1),&r_status1);CHKERRQ(ierr); 983 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 984 { 985 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 986 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 987 PetscInt *sbuf2_i; 988 989 for (i=0; i<nrqr; ++i) { 990 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 991 992 req_size[idex] = 0; 993 rbuf1_i = rbuf1[idex]; 994 start = 2*rbuf1_i[0] + 1; 995 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 996 ierr = PetscMalloc1((end+1),&sbuf2[idex]);CHKERRQ(ierr); 997 sbuf2_i = sbuf2[idex]; 998 for (j=start; j<end; j++) { 999 id = rbuf1_i[j] - rstart; 1000 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1001 sbuf2_i[j] = ncols; 1002 req_size[idex] += ncols; 1003 } 1004 req_source[idex] = r_status1[i].MPI_SOURCE; 1005 /* form the header */ 1006 sbuf2_i[0] = req_size[idex]; 1007 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1008 1009 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1010 } 1011 } 1012 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1013 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1014 1015 /* recv buffer sizes */ 1016 /* Receive messages*/ 1017 1018 ierr = PetscMalloc1((nrqs+1),&rbuf3);CHKERRQ(ierr); 1019 ierr = PetscMalloc1((nrqs+1),&rbuf4);CHKERRQ(ierr); 1020 ierr = PetscMalloc1((nrqs+1),&r_waits3);CHKERRQ(ierr); 1021 ierr = PetscMalloc1((nrqs+1),&r_waits4);CHKERRQ(ierr); 1022 ierr = PetscMalloc1((nrqs+1),&r_status2);CHKERRQ(ierr); 1023 1024 for (i=0; i<nrqs; ++i) { 1025 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1026 ierr = PetscMalloc1((rbuf2[idex][0]+1),&rbuf3[idex]);CHKERRQ(ierr); 1027 ierr = PetscMalloc1((rbuf2[idex][0]+1),&rbuf4[idex]);CHKERRQ(ierr); 1028 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1029 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1030 } 1031 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1032 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1033 1034 /* Wait on sends1 and sends2 */ 1035 ierr = PetscMalloc1((nrqs+1),&s_status1);CHKERRQ(ierr); 1036 ierr = PetscMalloc1((nrqr+1),&s_status2);CHKERRQ(ierr); 1037 1038 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1039 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1040 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1041 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1042 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1043 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1044 1045 /* Now allocate buffers for a->j, and send them off */ 1046 ierr = PetscMalloc1((nrqr+1),&sbuf_aj);CHKERRQ(ierr); 1047 for (i=0,j=0; i<nrqr; i++) j += 
req_size[i]; 1048 ierr = PetscMalloc1((j+1),&sbuf_aj[0]);CHKERRQ(ierr); 1049 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1050 1051 ierr = PetscMalloc1((nrqr+1),&s_waits3);CHKERRQ(ierr); 1052 { 1053 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1054 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1055 PetscInt cend = C->cmap->rend; 1056 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1057 1058 for (i=0; i<nrqr; i++) { 1059 rbuf1_i = rbuf1[i]; 1060 sbuf_aj_i = sbuf_aj[i]; 1061 ct1 = 2*rbuf1_i[0] + 1; 1062 ct2 = 0; 1063 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1064 kmax = rbuf1[i][2*j]; 1065 for (k=0; k<kmax; k++,ct1++) { 1066 row = rbuf1_i[ct1] - rstart; 1067 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1068 ncols = nzA + nzB; 1069 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1070 1071 /* load the column indices for this row into cols*/ 1072 cols = sbuf_aj_i + ct2; 1073 1074 lwrite = 0; 1075 for (l=0; l<nzB; l++) { 1076 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1077 } 1078 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1079 for (l=0; l<nzB; l++) { 1080 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1081 } 1082 1083 ct2 += ncols; 1084 } 1085 } 1086 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1087 } 1088 } 1089 ierr = PetscMalloc1((nrqs+1),&r_status3);CHKERRQ(ierr); 1090 ierr = PetscMalloc1((nrqr+1),&s_status3);CHKERRQ(ierr); 1091 1092 /* Allocate buffers for a->a, and send them off */ 1093 ierr = PetscMalloc1((nrqr+1),&sbuf_aa);CHKERRQ(ierr); 1094 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1095 ierr = PetscMalloc1((j+1),&sbuf_aa[0]);CHKERRQ(ierr); 1096 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1097 1098 ierr = PetscMalloc1((nrqr+1),&s_waits4);CHKERRQ(ierr); 1099 { 1100 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1101 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1102 PetscInt cend = C->cmap->rend; 1103 PetscInt *b_j = b->j; 1104 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1105 1106 for (i=0; i<nrqr; i++) { 1107 rbuf1_i = rbuf1[i]; 1108 sbuf_aa_i = sbuf_aa[i]; 1109 ct1 = 2*rbuf1_i[0]+1; 1110 ct2 = 0; 1111 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1112 kmax = rbuf1_i[2*j]; 1113 for (k=0; k<kmax; k++,ct1++) { 1114 row = rbuf1_i[ct1] - rstart; 1115 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1116 ncols = nzA + nzB; 1117 cworkB = b_j + b_i[row]; 1118 vworkA = a_a + a_i[row]; 1119 vworkB = b_a + b_i[row]; 1120 1121 /* load the column values for this row into vals*/ 1122 vals = sbuf_aa_i+ct2; 1123 1124 lwrite = 0; 1125 for (l=0; l<nzB; l++) { 1126 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1127 } 1128 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1129 for (l=0; l<nzB; l++) { 1130 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1131 } 1132 1133 ct2 += ncols; 1134 } 1135 } 1136 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1137 } 1138 } 1139 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1140 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1141 ierr = PetscMalloc1((nrqs+1),&r_status4);CHKERRQ(ierr); 1142 ierr = PetscMalloc1((nrqr+1),&s_status4);CHKERRQ(ierr); 1143 1144 /* Form the matrix */ 1145 /* create col map: global col of C -> local col of submatrices */ 1146 { 1147 const PetscInt *icol_i; 1148 #if defined(PETSC_USE_CTABLE) 1149 ierr = 
PetscMalloc1((1+ismax),&cmap);CHKERRQ(ierr); 1150 for (i=0; i<ismax; i++) { 1151 if (!allcolumns[i]) { 1152 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1153 1154 jmax = ncol[i]; 1155 icol_i = icol[i]; 1156 cmap_i = cmap[i]; 1157 for (j=0; j<jmax; j++) { 1158 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1159 } 1160 } else { 1161 cmap[i] = NULL; 1162 } 1163 } 1164 #else 1165 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1166 for (i=0; i<ismax; i++) { 1167 if (!allcolumns[i]) { 1168 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1169 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1170 jmax = ncol[i]; 1171 icol_i = icol[i]; 1172 cmap_i = cmap[i]; 1173 for (j=0; j<jmax; j++) { 1174 cmap_i[icol_i[j]] = j+1; 1175 } 1176 } else { 1177 cmap[i] = NULL; 1178 } 1179 } 1180 #endif 1181 } 1182 1183 /* Create lens which is required for MatCreate... */ 1184 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1185 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1186 if (ismax) { 1187 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1188 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1189 } 1190 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1191 1192 /* Update lens from local data */ 1193 for (i=0; i<ismax; i++) { 1194 jmax = nrow[i]; 1195 if (!allcolumns[i]) cmap_i = cmap[i]; 1196 irow_i = irow[i]; 1197 lens_i = lens[i]; 1198 for (j=0; j<jmax; j++) { 1199 l = 0; 1200 row = irow_i[j]; 1201 while (row >= C->rmap->range[l+1]) l++; 1202 proc = l; 1203 if (proc == rank) { 1204 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1205 if (!allcolumns[i]) { 1206 for (k=0; k<ncols; k++) { 1207 #if defined(PETSC_USE_CTABLE) 1208 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1209 #else 1210 tcol = cmap_i[cols[k]]; 1211 #endif 1212 if (tcol) lens_i[j]++; 1213 } 1214 } else { /* allcolumns */ 1215 lens_i[j] = ncols; 1216 } 1217 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1218 } 1219 } 1220 } 1221 1222 /* Create row map: global row of C -> local row of submatrices */ 1223 #if defined(PETSC_USE_CTABLE) 1224 ierr = PetscMalloc1((1+ismax),&rmap);CHKERRQ(ierr); 1225 for (i=0; i<ismax; i++) { 1226 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1227 rmap_i = rmap[i]; 1228 irow_i = irow[i]; 1229 jmax = nrow[i]; 1230 for (j=0; j<jmax; j++) { 1231 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1232 } 1233 } 1234 #else 1235 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1236 if (ismax) { 1237 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1238 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1239 } 1240 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1241 for (i=0; i<ismax; i++) { 1242 rmap_i = rmap[i]; 1243 irow_i = irow[i]; 1244 jmax = nrow[i]; 1245 for (j=0; j<jmax; j++) { 1246 rmap_i[irow_i[j]] = j; 1247 } 1248 } 1249 #endif 1250 1251 /* Update lens from offproc data */ 1252 { 1253 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1254 1255 for (tmp2=0; tmp2<nrqs; tmp2++) { 1256 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1257 idex = pa[idex2]; 1258 sbuf1_i = sbuf1[idex]; 1259 jmax = sbuf1_i[0]; 1260 ct1 = 2*jmax+1; 1261 ct2 = 0; 1262 rbuf2_i = rbuf2[idex2]; 1263 rbuf3_i = rbuf3[idex2]; 1264 for (j=1; j<=jmax; j++) { 1265 is_no = sbuf1_i[2*j-1]; 1266 max1 = sbuf1_i[2*j]; 1267 lens_i = lens[is_no]; 1268 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1269 
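        /* For this index set: rbuf2 holds, per requested row, the number of nonzeros the
           owning process will return, and rbuf3 holds the corresponding global column
           indices; they are scanned below to count the columns that belong to this
           submatrix (those that survive the cmap filter, or all of them if allcolumns). */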
rmap_i = rmap[is_no]; 1270 for (k=0; k<max1; k++,ct1++) { 1271 #if defined(PETSC_USE_CTABLE) 1272 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1273 row--; 1274 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1275 #else 1276 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1277 #endif 1278 max2 = rbuf2_i[ct1]; 1279 for (l=0; l<max2; l++,ct2++) { 1280 if (!allcolumns[is_no]) { 1281 #if defined(PETSC_USE_CTABLE) 1282 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1283 #else 1284 tcol = cmap_i[rbuf3_i[ct2]]; 1285 #endif 1286 if (tcol) lens_i[row]++; 1287 } else { /* allcolumns */ 1288 lens_i[row]++; /* lens_i[row] += max2 ? */ 1289 } 1290 } 1291 } 1292 } 1293 } 1294 } 1295 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1296 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1297 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1298 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1299 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1300 1301 /* Create the submatrices */ 1302 if (scall == MAT_REUSE_MATRIX) { 1303 PetscBool flag; 1304 1305 /* 1306 Assumes new rows are same length as the old rows,hence bug! 1307 */ 1308 for (i=0; i<ismax; i++) { 1309 mat = (Mat_SeqAIJ*)(submats[i]->data); 1310 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1311 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1312 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1313 /* Initial matrix as if empty */ 1314 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1315 1316 submats[i]->factortype = C->factortype; 1317 } 1318 } else { 1319 for (i=0; i<ismax; i++) { 1320 PetscInt rbs,cbs; 1321 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1322 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1323 1324 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1325 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1326 1327 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1328 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1329 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1330 } 1331 } 1332 1333 /* Assemble the matrices */ 1334 /* First assemble the local rows */ 1335 { 1336 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1337 PetscScalar *imat_a; 1338 1339 for (i=0; i<ismax; i++) { 1340 mat = (Mat_SeqAIJ*)submats[i]->data; 1341 imat_ilen = mat->ilen; 1342 imat_j = mat->j; 1343 imat_i = mat->i; 1344 imat_a = mat->a; 1345 1346 if (!allcolumns[i]) cmap_i = cmap[i]; 1347 rmap_i = rmap[i]; 1348 irow_i = irow[i]; 1349 jmax = nrow[i]; 1350 for (j=0; j<jmax; j++) { 1351 l = 0; 1352 row = irow_i[j]; 1353 while (row >= C->rmap->range[l+1]) l++; 1354 proc = l; 1355 if (proc == rank) { 1356 old_row = row; 1357 #if defined(PETSC_USE_CTABLE) 1358 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1359 row--; 1360 #else 1361 row = rmap_i[row]; 1362 #endif 1363 ilen_row = imat_ilen[row]; 1364 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1365 mat_i = imat_i[row]; 1366 mat_a = imat_a + mat_i; 1367 mat_j = imat_j + mat_i; 1368 if (!allcolumns[i]) { 1369 for (k=0; k<ncols; k++) { 1370 #if defined(PETSC_USE_CTABLE) 1371 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1372 
#else 1373 tcol = cmap_i[cols[k]]; 1374 #endif 1375 if (tcol) { 1376 *mat_j++ = tcol - 1; 1377 *mat_a++ = vals[k]; 1378 ilen_row++; 1379 } 1380 } 1381 } else { /* allcolumns */ 1382 for (k=0; k<ncols; k++) { 1383 *mat_j++ = cols[k]; /* global col index! */ 1384 *mat_a++ = vals[k]; 1385 ilen_row++; 1386 } 1387 } 1388 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1389 1390 imat_ilen[row] = ilen_row; 1391 } 1392 } 1393 } 1394 } 1395 1396 /* Now assemble the off proc rows*/ 1397 { 1398 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1399 PetscInt *imat_j,*imat_i; 1400 PetscScalar *imat_a,*rbuf4_i; 1401 1402 for (tmp2=0; tmp2<nrqs; tmp2++) { 1403 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1404 idex = pa[idex2]; 1405 sbuf1_i = sbuf1[idex]; 1406 jmax = sbuf1_i[0]; 1407 ct1 = 2*jmax + 1; 1408 ct2 = 0; 1409 rbuf2_i = rbuf2[idex2]; 1410 rbuf3_i = rbuf3[idex2]; 1411 rbuf4_i = rbuf4[idex2]; 1412 for (j=1; j<=jmax; j++) { 1413 is_no = sbuf1_i[2*j-1]; 1414 rmap_i = rmap[is_no]; 1415 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1416 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1417 imat_ilen = mat->ilen; 1418 imat_j = mat->j; 1419 imat_i = mat->i; 1420 imat_a = mat->a; 1421 max1 = sbuf1_i[2*j]; 1422 for (k=0; k<max1; k++,ct1++) { 1423 row = sbuf1_i[ct1]; 1424 #if defined(PETSC_USE_CTABLE) 1425 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1426 row--; 1427 #else 1428 row = rmap_i[row]; 1429 #endif 1430 ilen = imat_ilen[row]; 1431 mat_i = imat_i[row]; 1432 mat_a = imat_a + mat_i; 1433 mat_j = imat_j + mat_i; 1434 max2 = rbuf2_i[ct1]; 1435 if (!allcolumns[is_no]) { 1436 for (l=0; l<max2; l++,ct2++) { 1437 1438 #if defined(PETSC_USE_CTABLE) 1439 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1440 #else 1441 tcol = cmap_i[rbuf3_i[ct2]]; 1442 #endif 1443 if (tcol) { 1444 *mat_j++ = tcol - 1; 1445 *mat_a++ = rbuf4_i[ct2]; 1446 ilen++; 1447 } 1448 } 1449 } else { /* allcolumns */ 1450 for (l=0; l<max2; l++,ct2++) { 1451 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1452 *mat_a++ = rbuf4_i[ct2]; 1453 ilen++; 1454 } 1455 } 1456 imat_ilen[row] = ilen; 1457 } 1458 } 1459 } 1460 } 1461 1462 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1463 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1464 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1465 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1466 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1467 1468 /* Restore the indices */ 1469 for (i=0; i<ismax; i++) { 1470 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1471 if (!allcolumns[i]) { 1472 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1473 } 1474 } 1475 1476 /* Destroy allocated memory */ 1477 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1478 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1479 ierr = PetscFree(pa);CHKERRQ(ierr); 1480 1481 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1482 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1483 for (i=0; i<nrqr; ++i) { 1484 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1485 } 1486 for (i=0; i<nrqs; ++i) { 1487 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1488 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1489 } 1490 1491 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1492 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1493 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1494 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1495 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1496 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1497 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 
1498 1499 #if defined(PETSC_USE_CTABLE) 1500 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1501 #else 1502 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1503 #endif 1504 ierr = PetscFree(rmap);CHKERRQ(ierr); 1505 1506 for (i=0; i<ismax; i++) { 1507 if (!allcolumns[i]) { 1508 #if defined(PETSC_USE_CTABLE) 1509 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1510 #else 1511 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1512 #endif 1513 } 1514 } 1515 ierr = PetscFree(cmap);CHKERRQ(ierr); 1516 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1517 ierr = PetscFree(lens);CHKERRQ(ierr); 1518 1519 for (i=0; i<ismax; i++) { 1520 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1521 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1522 } 1523 PetscFunctionReturn(0); 1524 } 1525 1526 /* 1527 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1528 Be careful not to destroy them elsewhere. 1529 */ 1530 #undef __FUNCT__ 1531 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1532 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1533 { 1534 /* If making this function public, change the error returned in this function away from _PLIB. */ 1535 PetscErrorCode ierr; 1536 Mat_MPIAIJ *aij; 1537 PetscBool seqaij; 1538 1539 PetscFunctionBegin; 1540 /* Check to make sure the component matrices are compatible with C. */ 1541 ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr); 1542 if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1543 ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr); 1544 if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1545 if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1546 1547 ierr = MatCreate(comm, C);CHKERRQ(ierr); 1548 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr); 1549 ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr); 1550 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1551 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1552 if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1553 ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr); 1554 aij = (Mat_MPIAIJ*)((*C)->data); 1555 aij->A = A; 1556 aij->B = B; 1557 ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr); 1558 ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr); 1559 1560 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1561 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1562 PetscFunctionReturn(0); 1563 } 1564 1565 #undef __FUNCT__ 1566 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1567 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1568 { 1569 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1570 1571 PetscFunctionBegin; 1572 PetscValidPointer(A,2); 1573 PetscValidPointer(B,3); 1574 *A = aij->A; 1575 *B = aij->B; 1576 /* Note that we don't incref *A and *B, so be careful! 
 */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm,Mat,Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat,Mat*,Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so we can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat         *A,*B;
    IS          *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
       Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
       array of sequential matrices underlying the resulting parallel matrices.
       Which arrays to allocate depends on the value of MatReuse scall.
       There are as many diag matrices as there are original index sets.
       There are only as many parallel and off-diag matrices as there are parallel (comm size > 1) index sets.

       Sequential matrix arrays are allocated in every case: even if the array of parallel matrices already exists,
       we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
       getsubmats_seq will deposit the extracted diag and off-diag parts.
       However, if reuse is taking place, we have to allocate the seq matrix arrays here.
       If reuse is NOT taking place, the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1((ismax),submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i];
      }
    }
    /*
       Construct the complements of the iscol ISs for parallel ISs only.
       These are used to extract the off-diag portion of the resulting parallel matrix.
       The row IS for the off-diag portion is the same as for the diag portion,
       so we merely alias the row IS, while skipping those that are sequential.
1641 */ 1642 ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr); 1643 for (i = 0, ii = 0; i < ismax; ++i) { 1644 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1645 if (size > 1) { 1646 isrow_c[ii] = isrow[i]; 1647 1648 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr); 1649 ++ii; 1650 } 1651 } 1652 /* Now obtain the sequential A and B submatrices separately. */ 1653 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr); 1654 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr); 1655 for (ii = 0; ii < ismax_c; ++ii) { 1656 ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr); 1657 } 1658 ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr); 1659 /* 1660 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1661 have been extracted directly into the parallel matrices containing them, or 1662 simply into the sequential matrix identical with the corresponding A (if size == 1). 1663 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1664 A is put into the correct submat slot (if size == 1). 1665 */ 1666 if (scall != MAT_REUSE_MATRIX) { 1667 for (i = 0, ii = 0; i < ismax; ++i) { 1668 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1669 if (size > 1) { 1670 /* 1671 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1672 */ 1673 /* Construct submat[i] from the Seq pieces A and B. */ 1674 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr); 1675 1676 ++ii; 1677 } else (*submat)[i] = A[i]; 1678 } 1679 } 1680 ierr = PetscFree(A);CHKERRQ(ierr); 1681 ierr = PetscFree(B);CHKERRQ(ierr); 1682 } 1683 PetscFunctionReturn(0); 1684 } /* MatGetSubMatricesParallel_MPIXAIJ() */ 1685 1686 1687 1688 #undef __FUNCT__ 1689 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1690 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1691 { 1692 PetscErrorCode ierr; 1693 1694 PetscFunctionBegin; 1695 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr); 1696 PetscFunctionReturn(0); 1697 } 1698
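
/*
   Illustrative sketch (kept out of the build with #if 0, and the helper name is just an
   example): how the routines in this file are typically reached through the public
   interface.  For a MATMPIAIJ matrix A, MatIncreaseOverlap() dispatches to
   MatIncreaseOverlap_MPIAIJ() and MatGetSubMatrices() dispatches to
   MatGetSubMatrices_MPIAIJ(); the index set is grown by one level of overlap and the
   corresponding sequential submatrix is extracted on each process.
*/
#if 0
static PetscErrorCode ExampleOverlapAndExtract(Mat A)
{
  PetscErrorCode ierr;
  PetscInt       rstart,rend;
  IS             is;
  Mat            *submats;

  PetscFunctionBegin;
  ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
  /* start from the locally owned rows */
  ierr = ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&is);CHKERRQ(ierr);
  /* grow the index set by one level of matrix connectivity */
  ierr = MatIncreaseOverlap(A,1,&is,1);CHKERRQ(ierr);
  /* the column index sets must be sorted for the extraction path above */
  ierr = ISSort(is);CHKERRQ(ierr);
  /* extract the (sequential) submatrix defined by the overlapped rows/columns */
  ierr = MatGetSubMatrices(A,1,&is,&is,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
  ierr = MatDestroyMatrices(1,&submats);CHKERRQ(ierr);
  ierr = ISDestroy(&is);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif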