/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

/* NOTE: PetscBT is a typedef for char*, so the char** in this prototype matches
   the PetscBT* parameter of the definition below. */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Grows each of the imax index sets in is[] by ov
   levels of overlap; each level is one call to MatIncreaseOverlap_MPIAIJ_Once,
   which costs one round of communication.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one level of overlap increase.
   Protocol (two phases):
     1. Each rank scans its index sets, buckets off-process rows by owner,
        and sends each owner one message in the header+data format above (tag1).
        Rows owned locally are expanded in place by MatIncreaseOverlap_MPIAIJ_Local.
     2. Each receiver expands the requested rows (MatIncreaseOverlap_MPIAIJ_Receive)
        and sends the enlarged lists back (tag2); the results are merged into the
        local tables and the input index sets are destroyed and recreated.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total number of indices destined for rank p,
     w2[p] = 1 iff a message goes to p, w3[p] = number of ISs contributing to p,
     w4    = per-IS scratch counts */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp is one contiguous slab that
     outdat[] partitions per destination rank */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1; /* data region starts after the header */
  }

  /* Memory for doing local proc's work: per-IS bit tables (t_p slab) and
     per-IS result arrays of capacity M (d_p slab) */
  {
    ierr = PetscMalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* The input ISs are destroyed here; they are recreated (enlarged) below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work*/
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Expand the requested rows; xdata[i]/isz1[i] hold the reply for request i */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: merge each returned IS segment into the local tables */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Recreate the (now enlarged) index sets from the accumulated data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
342 343 Output: 344 isz - array containing the count of the solution elements corresponding 345 to each index set; 346 data - pointer to the solutions 347 */ 348 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 349 { 350 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 351 Mat A = c->A,B = c->B; 352 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 353 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 354 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 355 PetscBT table_i; 356 357 PetscFunctionBegin; 358 rstart = C->rmap->rstart; 359 cstart = C->cmap->rstart; 360 ai = a->i; 361 aj = a->j; 362 bi = b->i; 363 bj = b->j; 364 garray = c->garray; 365 366 367 for (i=0; i<imax; i++) { 368 data_i = data[i]; 369 table_i = table[i]; 370 isz_i = isz[i]; 371 for (j=0,max=isz[i]; j<max; j++) { 372 row = data_i[j] - rstart; 373 start = ai[row]; 374 end = ai[row+1]; 375 for (k=start; k<end; k++) { /* Amat */ 376 val = aj[k] + cstart; 377 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 378 } 379 start = bi[row]; 380 end = bi[row+1]; 381 for (k=start; k<end; k++) { /* Bmat */ 382 val = garray[bj[k]]; 383 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val; 384 } 385 } 386 isz[i] = isz_i; 387 } 388 PetscFunctionReturn(0); 389 } 390 391 #undef __FUNCT__ 392 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 393 /* 394 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 395 and return the output 396 397 Input: 398 C - the matrix 399 nrqr - no of messages being processed. 400 rbuf - an array of pointers to the recieved requests 401 402 Output: 403 xdata - array of messages to be sent back 404 isz1 - size of each message 405 406 For better efficiency perhaps we should malloc separately each xdata[i], 407 then if a remalloc is required we need only copy the data for that one row 408 rather then all previous rows as it is now where a single large chunck of 409 memory is used. 
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of IS segments over all requests; total_sz = total
     number of requested row indices */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* Estimate the reply buffer from the average local row length; grown by 1.5x
     below whenever it turns out too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct1 reads the incoming indices; ct2 writes the outgoing reply for request i;
     ct3 counts total output entries for the overflow check */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the single slab and rebase all per-request pointers into it */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append the neighbors of every row copied above */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal-block columns, shifted to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal-block columns via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one */
    isz1[i]     = ct2;            /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Assembles the whole parallel matrix as one
   sequential matrix on every rank (structure via MPI_Allgatherv of column
   indices, values optionally gathered when flag == MAT_GET_VALUES).
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over; the local rows are written
       directly into the destination slot of B's index array so the in-place
       allgather below needs no separate send buffer
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) { /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices; fast-paths
   the "every rank wants the whole matrix" case, otherwise processes the index
   sets in stages through MatGetSubMatrices_MPIAIJ_Local.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
  /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
     However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
     take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
  */
  for (i = 0; i < ismax; ++i) {
    PetscBool sorted;
    ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
    if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
  }

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      /* user can opt out of the fast path at runtime */
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* MPI_MIN: take the fast path only if EVERY rank qualifies */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages: ranks with fewer sets
     still call _Local with max_no = 0 so collective communication matches up */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined(PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status
*r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 830 MPI_Status *r_status3,*r_status4,*s_status4; 831 MPI_Comm comm; 832 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 833 PetscMPIInt *onodes1,*olengths1; 834 PetscMPIInt idex,idex2,end; 835 836 PetscFunctionBegin; 837 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 838 tag0 = ((PetscObject)C)->tag; 839 size = c->size; 840 rank = c->rank; 841 842 /* Get some new tags to keep the communication clean */ 843 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 844 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 845 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 846 847 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 848 849 for (i=0; i<ismax; i++) { 850 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 851 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 852 if (allcolumns[i]) { 853 icol[i] = NULL; 854 ncol[i] = C->cmap->N; 855 } else { 856 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 857 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 858 } 859 } 860 861 /* evaluate communication - mesg to who, length of mesg, and buffer space 862 required. 
Based on this, buffers are allocated, and data copied into them*/ 863 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 864 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 865 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 866 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 867 for (i=0; i<ismax; i++) { 868 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 869 jmax = nrow[i]; 870 irow_i = irow[i]; 871 for (j=0; j<jmax; j++) { 872 l = 0; 873 row = irow_i[j]; 874 while (row >= C->rmap->range[l+1]) l++; 875 proc = l; 876 w4[proc]++; 877 } 878 for (j=0; j<size; j++) { 879 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 880 } 881 } 882 883 nrqs = 0; /* no of outgoing messages */ 884 msz = 0; /* total mesg length (for all procs) */ 885 w1[rank] = 0; /* no mesg sent to self */ 886 w3[rank] = 0; 887 for (i=0; i<size; i++) { 888 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 889 } 890 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 891 for (i=0,j=0; i<size; i++) { 892 if (w1[i]) { pa[j] = i; j++; } 893 } 894 895 /* Each message would have a header = 1 + 2*(no of IS) + data */ 896 for (i=0; i<nrqs; i++) { 897 j = pa[i]; 898 w1[j] += w2[j] + 2* w3[j]; 899 msz += w1[j]; 900 } 901 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 902 903 /* Determine the number of messages to expect, their lengths, from from-ids */ 904 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 905 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 906 907 /* Now post the Irecvs corresponding to these messages */ 908 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 909 910 ierr = 
PetscFree(onodes1);CHKERRQ(ierr); 911 ierr = PetscFree(olengths1);CHKERRQ(ierr); 912 913 /* Allocate Memory for outgoing messages */ 914 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 915 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 916 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 917 918 { 919 PetscInt *iptr = tmp,ict = 0; 920 for (i=0; i<nrqs; i++) { 921 j = pa[i]; 922 iptr += ict; 923 sbuf1[j] = iptr; 924 ict = w1[j]; 925 } 926 } 927 928 /* Form the outgoing messages */ 929 /* Initialize the header space */ 930 for (i=0; i<nrqs; i++) { 931 j = pa[i]; 932 sbuf1[j][0] = 0; 933 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 934 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 935 } 936 937 /* Parse the isrow and copy data into outbuf */ 938 for (i=0; i<ismax; i++) { 939 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 940 irow_i = irow[i]; 941 jmax = nrow[i]; 942 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 943 l = 0; 944 row = irow_i[j]; 945 while (row >= C->rmap->range[l+1]) l++; 946 proc = l; 947 if (proc != rank) { /* copy to the outgoing buf*/ 948 ctr[proc]++; 949 *ptr[proc] = row; 950 ptr[proc]++; 951 } 952 } 953 /* Update the headers for the current IS */ 954 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 955 if ((ctr_j = ctr[j])) { 956 sbuf1_j = sbuf1[j]; 957 k = ++sbuf1_j[0]; 958 sbuf1_j[2*k] = ctr_j; 959 sbuf1_j[2*k-1] = i; 960 } 961 } 962 } 963 964 /* Now post the sends */ 965 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 966 for (i=0; i<nrqs; ++i) { 967 j = pa[i]; 968 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 969 } 970 971 /* Post Receives to capture the buffer size */ 972 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 973 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 974 rbuf2[0] = tmp + msz; 975 for (i=1; i<nrqs; ++i) { 
976 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 977 } 978 for (i=0; i<nrqs; ++i) { 979 j = pa[i]; 980 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 981 } 982 983 /* Send to other procs the buf size they should allocate */ 984 985 986 /* Receive messages*/ 987 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 988 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 989 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 990 { 991 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 992 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 993 PetscInt *sbuf2_i; 994 995 for (i=0; i<nrqr; ++i) { 996 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 997 998 req_size[idex] = 0; 999 rbuf1_i = rbuf1[idex]; 1000 start = 2*rbuf1_i[0] + 1; 1001 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1002 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 1003 sbuf2_i = sbuf2[idex]; 1004 for (j=start; j<end; j++) { 1005 id = rbuf1_i[j] - rstart; 1006 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1007 sbuf2_i[j] = ncols; 1008 req_size[idex] += ncols; 1009 } 1010 req_source[idex] = r_status1[i].MPI_SOURCE; 1011 /* form the header */ 1012 sbuf2_i[0] = req_size[idex]; 1013 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 1014 1015 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1016 } 1017 } 1018 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1019 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1020 1021 /* recv buffer sizes */ 1022 /* Receive messages*/ 1023 1024 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1025 ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1026 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1027 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1028 ierr 
= PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1029 1030 for (i=0; i<nrqs; ++i) { 1031 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1032 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1033 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1034 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1035 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1036 } 1037 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1038 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1039 1040 /* Wait on sends1 and sends2 */ 1041 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1042 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1043 1044 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1045 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1046 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1047 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1048 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1049 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1050 1051 /* Now allocate buffers for a->j, and send them off */ 1052 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1053 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1054 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1055 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1056 1057 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1058 { 1059 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1060 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1061 PetscInt cend = C->cmap->rend; 1062 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1063 1064 for (i=0; i<nrqr; i++) { 1065 rbuf1_i = rbuf1[i]; 1066 sbuf_aj_i = 
sbuf_aj[i]; 1067 ct1 = 2*rbuf1_i[0] + 1; 1068 ct2 = 0; 1069 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1070 kmax = rbuf1[i][2*j]; 1071 for (k=0; k<kmax; k++,ct1++) { 1072 row = rbuf1_i[ct1] - rstart; 1073 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1074 ncols = nzA + nzB; 1075 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1076 1077 /* load the column indices for this row into cols*/ 1078 cols = sbuf_aj_i + ct2; 1079 1080 lwrite = 0; 1081 for (l=0; l<nzB; l++) { 1082 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1083 } 1084 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1085 for (l=0; l<nzB; l++) { 1086 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1087 } 1088 1089 ct2 += ncols; 1090 } 1091 } 1092 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1093 } 1094 } 1095 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1096 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1097 1098 /* Allocate buffers for a->a, and send them off */ 1099 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1100 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1101 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1102 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1103 1104 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1105 { 1106 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1107 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1108 PetscInt cend = C->cmap->rend; 1109 PetscInt *b_j = b->j; 1110 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1111 1112 for (i=0; i<nrqr; i++) { 1113 rbuf1_i = rbuf1[i]; 1114 sbuf_aa_i = sbuf_aa[i]; 1115 ct1 = 2*rbuf1_i[0]+1; 1116 ct2 = 0; 1117 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1118 kmax = rbuf1_i[2*j]; 1119 for (k=0; k<kmax; k++,ct1++) { 1120 row = rbuf1_i[ct1] - 
rstart; 1121 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1122 ncols = nzA + nzB; 1123 cworkB = b_j + b_i[row]; 1124 vworkA = a_a + a_i[row]; 1125 vworkB = b_a + b_i[row]; 1126 1127 /* load the column values for this row into vals*/ 1128 vals = sbuf_aa_i+ct2; 1129 1130 lwrite = 0; 1131 for (l=0; l<nzB; l++) { 1132 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1133 } 1134 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1135 for (l=0; l<nzB; l++) { 1136 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1137 } 1138 1139 ct2 += ncols; 1140 } 1141 } 1142 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1143 } 1144 } 1145 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1146 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1147 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1148 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1149 1150 /* Form the matrix */ 1151 /* create col map: global col of C -> local col of submatrices */ 1152 { 1153 const PetscInt *icol_i; 1154 #if defined(PETSC_USE_CTABLE) 1155 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1156 for (i=0; i<ismax; i++) { 1157 if (!allcolumns[i]) { 1158 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1159 1160 jmax = ncol[i]; 1161 icol_i = icol[i]; 1162 cmap_i = cmap[i]; 1163 for (j=0; j<jmax; j++) { 1164 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1165 } 1166 } else { 1167 cmap[i] = NULL; 1168 } 1169 } 1170 #else 1171 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1172 for (i=0; i<ismax; i++) { 1173 if (!allcolumns[i]) { 1174 ierr = PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1175 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1176 jmax = ncol[i]; 1177 icol_i = icol[i]; 1178 cmap_i = cmap[i]; 1179 for (j=0; j<jmax; j++) { 1180 cmap_i[icol_i[j]] = 
j+1; 1181 } 1182 } else { 1183 cmap[i] = NULL; 1184 } 1185 } 1186 #endif 1187 } 1188 1189 /* Create lens which is required for MatCreate... */ 1190 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1191 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1192 if (ismax) { 1193 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1194 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1195 } 1196 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1197 1198 /* Update lens from local data */ 1199 for (i=0; i<ismax; i++) { 1200 jmax = nrow[i]; 1201 if (!allcolumns[i]) cmap_i = cmap[i]; 1202 irow_i = irow[i]; 1203 lens_i = lens[i]; 1204 for (j=0; j<jmax; j++) { 1205 l = 0; 1206 row = irow_i[j]; 1207 while (row >= C->rmap->range[l+1]) l++; 1208 proc = l; 1209 if (proc == rank) { 1210 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1211 if (!allcolumns[i]) { 1212 for (k=0; k<ncols; k++) { 1213 #if defined(PETSC_USE_CTABLE) 1214 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1215 #else 1216 tcol = cmap_i[cols[k]]; 1217 #endif 1218 if (tcol) lens_i[j]++; 1219 } 1220 } else { /* allcolumns */ 1221 lens_i[j] = ncols; 1222 } 1223 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1224 } 1225 } 1226 } 1227 1228 /* Create row map: global row of C -> local row of submatrices */ 1229 #if defined(PETSC_USE_CTABLE) 1230 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1231 for (i=0; i<ismax; i++) { 1232 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1233 rmap_i = rmap[i]; 1234 irow_i = irow[i]; 1235 jmax = nrow[i]; 1236 for (j=0; j<jmax; j++) { 1237 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1238 } 1239 } 1240 #else 1241 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1242 if (ismax) { 1243 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1244 ierr = 
PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1245 } 1246 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1247 for (i=0; i<ismax; i++) { 1248 rmap_i = rmap[i]; 1249 irow_i = irow[i]; 1250 jmax = nrow[i]; 1251 for (j=0; j<jmax; j++) { 1252 rmap_i[irow_i[j]] = j; 1253 } 1254 } 1255 #endif 1256 1257 /* Update lens from offproc data */ 1258 { 1259 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1260 1261 for (tmp2=0; tmp2<nrqs; tmp2++) { 1262 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1263 idex = pa[idex2]; 1264 sbuf1_i = sbuf1[idex]; 1265 jmax = sbuf1_i[0]; 1266 ct1 = 2*jmax+1; 1267 ct2 = 0; 1268 rbuf2_i = rbuf2[idex2]; 1269 rbuf3_i = rbuf3[idex2]; 1270 for (j=1; j<=jmax; j++) { 1271 is_no = sbuf1_i[2*j-1]; 1272 max1 = sbuf1_i[2*j]; 1273 lens_i = lens[is_no]; 1274 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1275 rmap_i = rmap[is_no]; 1276 for (k=0; k<max1; k++,ct1++) { 1277 #if defined(PETSC_USE_CTABLE) 1278 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1279 row--; 1280 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1281 #else 1282 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1283 #endif 1284 max2 = rbuf2_i[ct1]; 1285 for (l=0; l<max2; l++,ct2++) { 1286 if (!allcolumns[is_no]) { 1287 #if defined(PETSC_USE_CTABLE) 1288 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1289 #else 1290 tcol = cmap_i[rbuf3_i[ct2]]; 1291 #endif 1292 if (tcol) lens_i[row]++; 1293 } else { /* allcolumns */ 1294 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1295 } 1296 } 1297 } 1298 } 1299 } 1300 } 1301 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1302 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1303 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1304 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1305 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1306 1307 /* Create the submatrices */ 1308 if (scall == MAT_REUSE_MATRIX) { 1309 PetscBool flag; 1310 1311 /* 1312 Assumes new rows are same length as the old rows,hence bug! 1313 */ 1314 for (i=0; i<ismax; i++) { 1315 mat = (Mat_SeqAIJ*)(submats[i]->data); 1316 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1317 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1318 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1319 /* Initial matrix as if empty */ 1320 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1321 1322 submats[i]->factortype = C->factortype; 1323 } 1324 } else { 1325 for (i=0; i<ismax; i++) { 1326 PetscInt rbs,cbs; 1327 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1328 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1329 1330 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1331 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1332 1333 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1334 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1335 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1336 } 1337 } 1338 1339 /* Assemble the matrices */ 1340 /* First assemble the local rows */ 1341 { 1342 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1343 PetscScalar *imat_a; 1344 1345 for (i=0; i<ismax; i++) { 1346 mat = (Mat_SeqAIJ*)submats[i]->data; 1347 imat_ilen = mat->ilen; 1348 imat_j = mat->j; 
1349 imat_i = mat->i; 1350 imat_a = mat->a; 1351 1352 if (!allcolumns[i]) cmap_i = cmap[i]; 1353 rmap_i = rmap[i]; 1354 irow_i = irow[i]; 1355 jmax = nrow[i]; 1356 for (j=0; j<jmax; j++) { 1357 l = 0; 1358 row = irow_i[j]; 1359 while (row >= C->rmap->range[l+1]) l++; 1360 proc = l; 1361 if (proc == rank) { 1362 old_row = row; 1363 #if defined(PETSC_USE_CTABLE) 1364 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1365 row--; 1366 #else 1367 row = rmap_i[row]; 1368 #endif 1369 ilen_row = imat_ilen[row]; 1370 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1371 mat_i = imat_i[row]; 1372 mat_a = imat_a + mat_i; 1373 mat_j = imat_j + mat_i; 1374 if (!allcolumns[i]) { 1375 for (k=0; k<ncols; k++) { 1376 #if defined(PETSC_USE_CTABLE) 1377 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1378 #else 1379 tcol = cmap_i[cols[k]]; 1380 #endif 1381 if (tcol) { 1382 *mat_j++ = tcol - 1; 1383 *mat_a++ = vals[k]; 1384 ilen_row++; 1385 } 1386 } 1387 } else { /* allcolumns */ 1388 for (k=0; k<ncols; k++) { 1389 *mat_j++ = cols[k]; /* global col index! 
*/ 1390 *mat_a++ = vals[k]; 1391 ilen_row++; 1392 } 1393 } 1394 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1395 1396 imat_ilen[row] = ilen_row; 1397 } 1398 } 1399 } 1400 } 1401 1402 /* Now assemble the off proc rows*/ 1403 { 1404 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1405 PetscInt *imat_j,*imat_i; 1406 PetscScalar *imat_a,*rbuf4_i; 1407 1408 for (tmp2=0; tmp2<nrqs; tmp2++) { 1409 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1410 idex = pa[idex2]; 1411 sbuf1_i = sbuf1[idex]; 1412 jmax = sbuf1_i[0]; 1413 ct1 = 2*jmax + 1; 1414 ct2 = 0; 1415 rbuf2_i = rbuf2[idex2]; 1416 rbuf3_i = rbuf3[idex2]; 1417 rbuf4_i = rbuf4[idex2]; 1418 for (j=1; j<=jmax; j++) { 1419 is_no = sbuf1_i[2*j-1]; 1420 rmap_i = rmap[is_no]; 1421 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1422 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1423 imat_ilen = mat->ilen; 1424 imat_j = mat->j; 1425 imat_i = mat->i; 1426 imat_a = mat->a; 1427 max1 = sbuf1_i[2*j]; 1428 for (k=0; k<max1; k++,ct1++) { 1429 row = sbuf1_i[ct1]; 1430 #if defined(PETSC_USE_CTABLE) 1431 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1432 row--; 1433 #else 1434 row = rmap_i[row]; 1435 #endif 1436 ilen = imat_ilen[row]; 1437 mat_i = imat_i[row]; 1438 mat_a = imat_a + mat_i; 1439 mat_j = imat_j + mat_i; 1440 max2 = rbuf2_i[ct1]; 1441 if (!allcolumns[is_no]) { 1442 for (l=0; l<max2; l++,ct2++) { 1443 1444 #if defined(PETSC_USE_CTABLE) 1445 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1446 #else 1447 tcol = cmap_i[rbuf3_i[ct2]]; 1448 #endif 1449 if (tcol) { 1450 *mat_j++ = tcol - 1; 1451 *mat_a++ = rbuf4_i[ct2]; 1452 ilen++; 1453 } 1454 } 1455 } else { /* allcolumns */ 1456 for (l=0; l<max2; l++,ct2++) { 1457 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1458 *mat_a++ = rbuf4_i[ct2]; 1459 ilen++; 1460 } 1461 } 1462 imat_ilen[row] = ilen; 1463 } 1464 } 1465 } 1466 } 1467 1468 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1469 ierr 
= PetscFree(r_waits4);CHKERRQ(ierr); 1470 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1471 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1472 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1473 1474 /* Restore the indices */ 1475 for (i=0; i<ismax; i++) { 1476 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1477 if (!allcolumns[i]) { 1478 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1479 } 1480 } 1481 1482 /* Destroy allocated memory */ 1483 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1484 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1485 ierr = PetscFree(pa);CHKERRQ(ierr); 1486 1487 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1488 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1489 for (i=0; i<nrqr; ++i) { 1490 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1491 } 1492 for (i=0; i<nrqs; ++i) { 1493 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1494 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1495 } 1496 1497 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1498 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1499 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1500 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1501 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1502 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1503 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1504 1505 #if defined(PETSC_USE_CTABLE) 1506 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1507 #else 1508 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1509 #endif 1510 ierr = PetscFree(rmap);CHKERRQ(ierr); 1511 1512 for (i=0; i<ismax; i++) { 1513 if (!allcolumns[i]) { 1514 #if defined(PETSC_USE_CTABLE) 1515 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1516 #else 1517 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1518 #endif 1519 } 1520 } 1521 ierr = PetscFree(cmap);CHKERRQ(ierr); 1522 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1523 ierr = PetscFree(lens);CHKERRQ(ierr); 1524 1525 for (i=0; i<ismax; i++) { 1526 ierr = 
MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1527 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1528 } 1529 PetscFunctionReturn(0); 1530 } 1531 1532 /* 1533 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1534 Be careful not to destroy them elsewhere. 1535 */ 1536 #undef __FUNCT__ 1537 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1538 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1539 { 1540 /* If making this function public, change the error returned in this function away from _PLIB. */ 1541 PetscErrorCode ierr; 1542 Mat_MPIAIJ *aij; 1543 PetscBool seqaij; 1544 1545 PetscFunctionBegin; 1546 /* Check to make sure the component matrices are compatible with C. */ 1547 ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr); 1548 if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1549 ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr); 1550 if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1551 if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1552 1553 ierr = MatCreate(comm, C);CHKERRQ(ierr); 1554 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr); 1555 ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr); 1556 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1557 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1558 if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1559 ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr); 1560 aij = (Mat_MPIAIJ*)((*C)->data); 1561 aij->A = A; 1562 aij->B = B; 1563 ierr = 
PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr); 1564 ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr); 1565 1566 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1567 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1568 PetscFunctionReturn(0); 1569 } 1570 1571 #undef __FUNCT__ 1572 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1573 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1574 { 1575 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1576 1577 PetscFunctionBegin; 1578 PetscValidPointer(A,2); 1579 PetscValidPointer(B,3); 1580 *A = aij->A; 1581 *B = aij->B; 1582 /* Note that we don't incref *A and *B, so be careful! */ 1583 PetscFunctionReturn(0); 1584 } 1585 1586 #undef __FUNCT__ 1587 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1588 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1589 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1590 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1591 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1592 { 1593 PetscErrorCode ierr; 1594 PetscMPIInt size, flag; 1595 PetscInt i,ii; 1596 PetscInt ismax_c; 1597 1598 PetscFunctionBegin; 1599 if (!ismax) PetscFunctionReturn(0); 1600 1601 for (i = 0, ismax_c = 0; i < ismax; ++i) { 1602 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr); 1603 if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1604 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1605 if (size > 1) ++ismax_c; 1606 } 1607 if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. 
*/ 1608 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr); 1609 } else { /* if (ismax_c) */ 1610 Mat *A,*B; 1611 IS *isrow_c, *iscol_c; 1612 PetscMPIInt size; 1613 /* 1614 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1615 array of sequential matrices underlying the resulting parallel matrices. 1616 Which arrays to allocate is based on the value of MatReuse scall. 1617 There are as many diag matrices as there are original index sets. 1618 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 1619 1620 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1621 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1622 will deposite the extracted diag and off-diag parts. 1623 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1624 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1625 */ 1626 1627 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1628 if (scall != MAT_REUSE_MATRIX) { 1629 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1630 } else { 1631 ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr); 1632 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr); 1633 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1634 for (i = 0, ii = 0; i < ismax; ++i) { 1635 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1636 if (size > 1) { 1637 ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr); 1638 ++ii; 1639 } else A[i] = (*submat)[i]; 1640 } 1641 } 1642 /* 1643 Construct the complements of the iscol ISs for parallel ISs only. 1644 These are used to extract the off-diag portion of the resulting parallel matrix. 
1645 The row IS for the off-diag portion is the same as for the diag portion, 1646 so we merely alias the row IS, while skipping those that are sequential. 1647 */ 1648 ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr); 1649 for (i = 0, ii = 0; i < ismax; ++i) { 1650 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1651 if (size > 1) { 1652 isrow_c[ii] = isrow[i]; 1653 1654 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr); 1655 ++ii; 1656 } 1657 } 1658 /* Now obtain the sequential A and B submatrices separately. */ 1659 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr); 1660 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr); 1661 for (ii = 0; ii < ismax_c; ++ii) { 1662 ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr); 1663 } 1664 ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr); 1665 /* 1666 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1667 have been extracted directly into the parallel matrices containing them, or 1668 simply into the sequential matrix identical with the corresponding A (if size == 1). 1669 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1670 A is put into the correct submat slot (if size == 1). 1671 */ 1672 if (scall != MAT_REUSE_MATRIX) { 1673 for (i = 0, ii = 0; i < ismax; ++i) { 1674 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr); 1675 if (size > 1) { 1676 /* 1677 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1678 */ 1679 /* Construct submat[i] from the Seq pieces A and B. 
*/ 1680 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr); 1681 1682 ++ii; 1683 } else (*submat)[i] = A[i]; 1684 } 1685 } 1686 ierr = PetscFree(A);CHKERRQ(ierr); 1687 ierr = PetscFree(B);CHKERRQ(ierr); 1688 } 1689 PetscFunctionReturn(0); 1690 } /* MatGetSubMatricesParallel_MPIXAIJ() */ 1691 1692 1693 1694 #undef __FUNCT__ 1695 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1696 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1697 { 1698 PetscErrorCode ierr; 1699 1700 PetscFunctionBegin; 1701 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr); 1702 PetscFunctionReturn(0); 1703 } 1704