/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/seq/aij.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Grows each index set in is[] by ov levels of
   overlap; each level is one application of MatIncreaseOverlap_MPIAIJ_Once,
   which replaces is[i] in place with the enlarged set.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[proc] accumulates the number of indices destined for proc, w3[proc] the
     number of index sets that contribute to proc, w2[proc] becomes a 0/1 flag
     marking whether any message goes to proc at all. */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp is one contiguous slab of msz
     entries and outdat[j] points into it at each destination's offset */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* ptr[j] walks the data area past the header */
  }

  /* Memory for doing local proc's work: per-IS bit table over all M global
     rows (table[i]) and per-IS result array of up to M entries (data[i]),
     both carved out of two shared allocations */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;  /* count of rows of IS i sent to proc j */
          outdat_j[2*k-1] = i;      /* which IS they belong to */
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input index sets are destroyed here and recreated (enlarged) below */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while communication is in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the requests received from other ranks; xdata[i]/isz1[i] hold the
     reply for request i (all xdata[i] point into the single xdata[0] slab) */
  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: merge each returned row into the owning IS's
         table/data, skipping rows already present */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the (now enlarged) index sets in place of the destroyed inputs */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;  /* maps B's compressed column numbering to global columns */

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* for every locally owned row already in IS i, add the global columns of
       its nonzeros (from both the diagonal and off-diagonal blocks) to the IS;
       note the j<max bound is fixed at entry so newly appended rows are not
       re-expanded in this pass */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

   Input:
      C    - the matrix
      nrqr - no of messages being processed.
      rbuf - an array of pointers to the received requests

   Output:
      xdata - array of messages to be sent back
      isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz = total number of requested rows over all messages,
     ct = total number of (IS,row-count) header pairs */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* initial estimate of reply size, based on the average number of nonzeros
     per local row; grown by 1.5x below whenever it proves too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts entries consumed in the single xdata[0] slab; all xdata[i]
     point into that slab, so every regrow must rebase xdata[1..i] */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append the global column indices of each such row's nonzeros */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat (diagonal block) */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat (off-diagonal block) */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var */
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
     Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      /* local row length = diagonal-block nonzeros + off-diagonal-block nonzeros */
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into the
       slot of B's j array that holds this rank's rows
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: the structure already exists, only values may change */
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the column-index pass above,
       so values line up with the already-gathered j entries */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path never allocated these above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information from the parallel matrix to the copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices from the
   parallel matrix C, staging the work so that memory use stays bounded.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* all ranks must agree before taking the fast all-matrix path */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages; ranks that have
     already extracted all their submatrices call with max_no = 0 */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt
**rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 804 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 805 #if defined(PETSC_USE_CTABLE) 806 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 807 #else 808 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 809 #endif 810 const PetscInt *irow_i; 811 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 812 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 813 MPI_Request *r_waits4,*s_waits3,*s_waits4; 814 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 815 MPI_Status *r_status3,*r_status4,*s_status4; 816 MPI_Comm comm; 817 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 818 PetscMPIInt *onodes1,*olengths1; 819 PetscMPIInt idex,idex2,end; 820 821 PetscFunctionBegin; 822 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 823 tag0 = ((PetscObject)C)->tag; 824 size = c->size; 825 rank = c->rank; 826 827 /* Get some new tags to keep the communication clean */ 828 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 829 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 830 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 831 832 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 833 834 for (i=0; i<ismax; i++) { 835 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 836 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 837 if (allcolumns[i]) { 838 icol[i] = NULL; 839 ncol[i] = C->cmap->N; 840 } else { 841 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 842 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 843 } 844 } 845 846 /* evaluate communication - mesg to who, length of mesg, and buffer space 847 required. 
Based on this, buffers are allocated, and data copied into them*/ 848 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 849 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 850 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 851 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 852 for (i=0; i<ismax; i++) { 853 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 854 jmax = nrow[i]; 855 irow_i = irow[i]; 856 for (j=0; j<jmax; j++) { 857 l = 0; 858 row = irow_i[j]; 859 while (row >= C->rmap->range[l+1]) l++; 860 proc = l; 861 w4[proc]++; 862 } 863 for (j=0; j<size; j++) { 864 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 865 } 866 } 867 868 nrqs = 0; /* no of outgoing messages */ 869 msz = 0; /* total mesg length (for all procs) */ 870 w1[rank] = 0; /* no mesg sent to self */ 871 w3[rank] = 0; 872 for (i=0; i<size; i++) { 873 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 874 } 875 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 876 for (i=0,j=0; i<size; i++) { 877 if (w1[i]) { pa[j] = i; j++; } 878 } 879 880 /* Each message would have a header = 1 + 2*(no of IS) + data */ 881 for (i=0; i<nrqs; i++) { 882 j = pa[i]; 883 w1[j] += w2[j] + 2* w3[j]; 884 msz += w1[j]; 885 } 886 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 887 888 /* Determine the number of messages to expect, their lengths, from from-ids */ 889 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 890 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 891 892 /* Now post the Irecvs corresponding to these messages */ 893 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 894 895 ierr = PetscFree(onodes1);CHKERRQ(ierr); 
896 ierr = PetscFree(olengths1);CHKERRQ(ierr); 897 898 /* Allocate Memory for outgoing messages */ 899 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 900 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 901 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 902 903 { 904 PetscInt *iptr = tmp,ict = 0; 905 for (i=0; i<nrqs; i++) { 906 j = pa[i]; 907 iptr += ict; 908 sbuf1[j] = iptr; 909 ict = w1[j]; 910 } 911 } 912 913 /* Form the outgoing messages */ 914 /* Initialize the header space */ 915 for (i=0; i<nrqs; i++) { 916 j = pa[i]; 917 sbuf1[j][0] = 0; 918 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 919 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 920 } 921 922 /* Parse the isrow and copy data into outbuf */ 923 for (i=0; i<ismax; i++) { 924 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 925 irow_i = irow[i]; 926 jmax = nrow[i]; 927 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 928 l = 0; 929 row = irow_i[j]; 930 while (row >= C->rmap->range[l+1]) l++; 931 proc = l; 932 if (proc != rank) { /* copy to the outgoing buf*/ 933 ctr[proc]++; 934 *ptr[proc] = row; 935 ptr[proc]++; 936 } 937 } 938 /* Update the headers for the current IS */ 939 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 940 if ((ctr_j = ctr[j])) { 941 sbuf1_j = sbuf1[j]; 942 k = ++sbuf1_j[0]; 943 sbuf1_j[2*k] = ctr_j; 944 sbuf1_j[2*k-1] = i; 945 } 946 } 947 } 948 949 /* Now post the sends */ 950 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 951 for (i=0; i<nrqs; ++i) { 952 j = pa[i]; 953 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 954 } 955 956 /* Post Receives to capture the buffer size */ 957 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 958 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 959 rbuf2[0] = tmp + msz; 960 for (i=1; i<nrqs; ++i) { 961 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 962 } 963 for (i=0; i<nrqs; ++i) { 964 j = pa[i]; 965 
ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 966 } 967 968 /* Send to other procs the buf size they should allocate */ 969 970 971 /* Receive messages*/ 972 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 973 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 974 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 975 { 976 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 977 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 978 PetscInt *sbuf2_i; 979 980 for (i=0; i<nrqr; ++i) { 981 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 982 983 req_size[idex] = 0; 984 rbuf1_i = rbuf1[idex]; 985 start = 2*rbuf1_i[0] + 1; 986 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 987 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 988 sbuf2_i = sbuf2[idex]; 989 for (j=start; j<end; j++) { 990 id = rbuf1_i[j] - rstart; 991 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 992 sbuf2_i[j] = ncols; 993 req_size[idex] += ncols; 994 } 995 req_source[idex] = r_status1[i].MPI_SOURCE; 996 /* form the header */ 997 sbuf2_i[0] = req_size[idex]; 998 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 999 1000 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1001 } 1002 } 1003 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1004 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1005 1006 /* recv buffer sizes */ 1007 /* Receive messages*/ 1008 1009 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 1010 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1011 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 1012 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1013 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 1014 1015 for (i=0; i<nrqs; ++i) { 1016 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1017 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 1018 ierr = 
PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 1019 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1020 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1021 } 1022 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1023 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1024 1025 /* Wait on sends1 and sends2 */ 1026 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1027 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 1028 1029 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1030 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1031 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1032 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1033 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1034 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1035 1036 /* Now allocate buffers for a->j, and send them off */ 1037 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1038 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1039 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1040 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1041 1042 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 1043 { 1044 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1045 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1046 PetscInt cend = C->cmap->rend; 1047 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1048 1049 for (i=0; i<nrqr; i++) { 1050 rbuf1_i = rbuf1[i]; 1051 sbuf_aj_i = sbuf_aj[i]; 1052 ct1 = 2*rbuf1_i[0] + 1; 1053 ct2 = 0; 1054 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1055 kmax = rbuf1[i][2*j]; 1056 for (k=0; k<kmax; k++,ct1++) { 1057 row = rbuf1_i[ct1] - rstart; 1058 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1059 ncols = nzA + nzB; 1060 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1061 1062 /* load the column indices for this row into cols*/ 
1063 cols = sbuf_aj_i + ct2; 1064 1065 lwrite = 0; 1066 for (l=0; l<nzB; l++) { 1067 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1068 } 1069 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1070 for (l=0; l<nzB; l++) { 1071 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1072 } 1073 1074 ct2 += ncols; 1075 } 1076 } 1077 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1078 } 1079 } 1080 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 1081 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 1082 1083 /* Allocate buffers for a->a, and send them off */ 1084 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1085 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1086 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1087 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1088 1089 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1090 { 1091 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1092 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1093 PetscInt cend = C->cmap->rend; 1094 PetscInt *b_j = b->j; 1095 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1096 1097 for (i=0; i<nrqr; i++) { 1098 rbuf1_i = rbuf1[i]; 1099 sbuf_aa_i = sbuf_aa[i]; 1100 ct1 = 2*rbuf1_i[0]+1; 1101 ct2 = 0; 1102 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1103 kmax = rbuf1_i[2*j]; 1104 for (k=0; k<kmax; k++,ct1++) { 1105 row = rbuf1_i[ct1] - rstart; 1106 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1107 ncols = nzA + nzB; 1108 cworkB = b_j + b_i[row]; 1109 vworkA = a_a + a_i[row]; 1110 vworkB = b_a + b_i[row]; 1111 1112 /* load the column values for this row into vals*/ 1113 vals = sbuf_aa_i+ct2; 1114 1115 lwrite = 0; 1116 for (l=0; l<nzB; l++) { 1117 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1118 } 1119 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1120 for (l=0; l<nzB; l++) { 1121 if ((bmap[cworkB[l]]) 
>= cend) vals[lwrite++] = vworkB[l]; 1122 } 1123 1124 ct2 += ncols; 1125 } 1126 } 1127 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1128 } 1129 } 1130 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1131 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1132 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1133 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1134 1135 /* Form the matrix */ 1136 /* create col map: global col of C -> local col of submatrices */ 1137 { 1138 const PetscInt *icol_i; 1139 #if defined(PETSC_USE_CTABLE) 1140 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1141 for (i=0; i<ismax; i++) { 1142 if (!allcolumns[i]) { 1143 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1144 1145 jmax = ncol[i]; 1146 icol_i = icol[i]; 1147 cmap_i = cmap[i]; 1148 for (j=0; j<jmax; j++) { 1149 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1150 } 1151 } else { 1152 cmap[i] = NULL; 1153 } 1154 } 1155 #else 1156 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1157 for (i=0; i<ismax; i++) { 1158 if (!allcolumns[i]) { 1159 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1160 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1161 jmax = ncol[i]; 1162 icol_i = icol[i]; 1163 cmap_i = cmap[i]; 1164 for (j=0; j<jmax; j++) { 1165 cmap_i[icol_i[j]] = j+1; 1166 } 1167 } else { 1168 cmap[i] = NULL; 1169 } 1170 } 1171 #endif 1172 } 1173 1174 /* Create lens which is required for MatCreate... 
*/ 1175 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1176 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1177 if (ismax) { 1178 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1179 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1180 } 1181 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1182 1183 /* Update lens from local data */ 1184 for (i=0; i<ismax; i++) { 1185 jmax = nrow[i]; 1186 if (!allcolumns[i]) cmap_i = cmap[i]; 1187 irow_i = irow[i]; 1188 lens_i = lens[i]; 1189 for (j=0; j<jmax; j++) { 1190 l = 0; 1191 row = irow_i[j]; 1192 while (row >= C->rmap->range[l+1]) l++; 1193 proc = l; 1194 if (proc == rank) { 1195 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1196 if (!allcolumns[i]) { 1197 for (k=0; k<ncols; k++) { 1198 #if defined(PETSC_USE_CTABLE) 1199 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1200 #else 1201 tcol = cmap_i[cols[k]]; 1202 #endif 1203 if (tcol) lens_i[j]++; 1204 } 1205 } else { /* allcolumns */ 1206 lens_i[j] = ncols; 1207 } 1208 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1209 } 1210 } 1211 } 1212 1213 /* Create row map: global row of C -> local row of submatrices */ 1214 #if defined(PETSC_USE_CTABLE) 1215 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1216 for (i=0; i<ismax; i++) { 1217 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1218 rmap_i = rmap[i]; 1219 irow_i = irow[i]; 1220 jmax = nrow[i]; 1221 for (j=0; j<jmax; j++) { 1222 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1223 } 1224 } 1225 #else 1226 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1227 if (ismax) { 1228 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1230 } 1231 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1232 for (i=0; i<ismax; i++) { 1233 rmap_i = rmap[i]; 1234 irow_i = irow[i]; 1235 jmax = nrow[i]; 1236 for (j=0; j<jmax; 
j++) { 1237 rmap_i[irow_i[j]] = j; 1238 } 1239 } 1240 #endif 1241 1242 /* Update lens from offproc data */ 1243 { 1244 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1245 1246 for (tmp2=0; tmp2<nrqs; tmp2++) { 1247 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1248 idex = pa[idex2]; 1249 sbuf1_i = sbuf1[idex]; 1250 jmax = sbuf1_i[0]; 1251 ct1 = 2*jmax+1; 1252 ct2 = 0; 1253 rbuf2_i = rbuf2[idex2]; 1254 rbuf3_i = rbuf3[idex2]; 1255 for (j=1; j<=jmax; j++) { 1256 is_no = sbuf1_i[2*j-1]; 1257 max1 = sbuf1_i[2*j]; 1258 lens_i = lens[is_no]; 1259 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1260 rmap_i = rmap[is_no]; 1261 for (k=0; k<max1; k++,ct1++) { 1262 #if defined(PETSC_USE_CTABLE) 1263 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1264 row--; 1265 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1266 #else 1267 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1268 #endif 1269 max2 = rbuf2_i[ct1]; 1270 for (l=0; l<max2; l++,ct2++) { 1271 if (!allcolumns[is_no]) { 1272 #if defined(PETSC_USE_CTABLE) 1273 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1274 #else 1275 tcol = cmap_i[rbuf3_i[ct2]]; 1276 #endif 1277 if (tcol) lens_i[row]++; 1278 } else { /* allcolumns */ 1279 lens_i[row]++; /* lens_i[row] += max2 ? */ 1280 } 1281 } 1282 } 1283 } 1284 } 1285 } 1286 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1287 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1288 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1289 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1290 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1291 1292 /* Create the submatrices */ 1293 if (scall == MAT_REUSE_MATRIX) { 1294 PetscBool flag; 1295 1296 /* 1297 Assumes new rows are same length as the old rows,hence bug! 
1298 */ 1299 for (i=0; i<ismax; i++) { 1300 mat = (Mat_SeqAIJ*)(submats[i]->data); 1301 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1302 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1303 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1304 /* Initial matrix as if empty */ 1305 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1306 1307 submats[i]->factortype = C->factortype; 1308 } 1309 } else { 1310 for (i=0; i<ismax; i++) { 1311 PetscInt rbs,cbs; 1312 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1313 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1314 1315 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1316 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1317 1318 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1319 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1320 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1321 } 1322 } 1323 1324 /* Assemble the matrices */ 1325 /* First assemble the local rows */ 1326 { 1327 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1328 PetscScalar *imat_a; 1329 1330 for (i=0; i<ismax; i++) { 1331 mat = (Mat_SeqAIJ*)submats[i]->data; 1332 imat_ilen = mat->ilen; 1333 imat_j = mat->j; 1334 imat_i = mat->i; 1335 imat_a = mat->a; 1336 1337 if (!allcolumns[i]) cmap_i = cmap[i]; 1338 rmap_i = rmap[i]; 1339 irow_i = irow[i]; 1340 jmax = nrow[i]; 1341 for (j=0; j<jmax; j++) { 1342 l = 0; 1343 row = irow_i[j]; 1344 while (row >= C->rmap->range[l+1]) l++; 1345 proc = l; 1346 if (proc == rank) { 1347 old_row = row; 1348 #if defined(PETSC_USE_CTABLE) 1349 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1350 row--; 1351 #else 1352 row = rmap_i[row]; 1353 #endif 
1354 ilen_row = imat_ilen[row]; 1355 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1356 mat_i = imat_i[row]; 1357 mat_a = imat_a + mat_i; 1358 mat_j = imat_j + mat_i; 1359 if (!allcolumns[i]) { 1360 for (k=0; k<ncols; k++) { 1361 #if defined(PETSC_USE_CTABLE) 1362 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1363 #else 1364 tcol = cmap_i[cols[k]]; 1365 #endif 1366 if (tcol) { 1367 *mat_j++ = tcol - 1; 1368 *mat_a++ = vals[k]; 1369 ilen_row++; 1370 } 1371 } 1372 } else { /* allcolumns */ 1373 for (k=0; k<ncols; k++) { 1374 *mat_j++ = cols[k]; /* global col index! */ 1375 *mat_a++ = vals[k]; 1376 ilen_row++; 1377 } 1378 } 1379 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1380 1381 imat_ilen[row] = ilen_row; 1382 } 1383 } 1384 } 1385 } 1386 1387 /* Now assemble the off proc rows*/ 1388 { 1389 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1390 PetscInt *imat_j,*imat_i; 1391 PetscScalar *imat_a,*rbuf4_i; 1392 1393 for (tmp2=0; tmp2<nrqs; tmp2++) { 1394 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1395 idex = pa[idex2]; 1396 sbuf1_i = sbuf1[idex]; 1397 jmax = sbuf1_i[0]; 1398 ct1 = 2*jmax + 1; 1399 ct2 = 0; 1400 rbuf2_i = rbuf2[idex2]; 1401 rbuf3_i = rbuf3[idex2]; 1402 rbuf4_i = rbuf4[idex2]; 1403 for (j=1; j<=jmax; j++) { 1404 is_no = sbuf1_i[2*j-1]; 1405 rmap_i = rmap[is_no]; 1406 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1407 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1408 imat_ilen = mat->ilen; 1409 imat_j = mat->j; 1410 imat_i = mat->i; 1411 imat_a = mat->a; 1412 max1 = sbuf1_i[2*j]; 1413 for (k=0; k<max1; k++,ct1++) { 1414 row = sbuf1_i[ct1]; 1415 #if defined(PETSC_USE_CTABLE) 1416 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1417 row--; 1418 #else 1419 row = rmap_i[row]; 1420 #endif 1421 ilen = imat_ilen[row]; 1422 mat_i = imat_i[row]; 1423 mat_a = imat_a + mat_i; 1424 mat_j = imat_j + mat_i; 1425 max2 = rbuf2_i[ct1]; 1426 if (!allcolumns[is_no]) { 1427 
for (l=0; l<max2; l++,ct2++) { 1428 1429 #if defined(PETSC_USE_CTABLE) 1430 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1431 #else 1432 tcol = cmap_i[rbuf3_i[ct2]]; 1433 #endif 1434 if (tcol) { 1435 *mat_j++ = tcol - 1; 1436 *mat_a++ = rbuf4_i[ct2]; 1437 ilen++; 1438 } 1439 } 1440 } else { /* allcolumns */ 1441 for (l=0; l<max2; l++,ct2++) { 1442 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1443 *mat_a++ = rbuf4_i[ct2]; 1444 ilen++; 1445 } 1446 } 1447 imat_ilen[row] = ilen; 1448 } 1449 } 1450 } 1451 } 1452 1453 /* sort the rows */ 1454 { 1455 PetscInt *imat_ilen,*imat_j,*imat_i; 1456 PetscScalar *imat_a; 1457 1458 for (i=0; i<ismax; i++) { 1459 mat = (Mat_SeqAIJ*)submats[i]->data; 1460 imat_j = mat->j; 1461 imat_i = mat->i; 1462 imat_a = mat->a; 1463 imat_ilen = mat->ilen; 1464 1465 if (allcolumns[i]) continue; 1466 jmax = nrow[i]; 1467 for (j=0; j<jmax; j++) { 1468 PetscInt ilen; 1469 1470 mat_i = imat_i[j]; 1471 mat_a = imat_a + mat_i; 1472 mat_j = imat_j + mat_i; 1473 ilen = imat_ilen[j]; 1474 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 1475 } 1476 } 1477 } 1478 1479 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1480 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1481 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1482 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1483 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1484 1485 /* Restore the indices */ 1486 for (i=0; i<ismax; i++) { 1487 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1488 if (!allcolumns[i]) { 1489 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1490 } 1491 } 1492 1493 /* Destroy allocated memory */ 1494 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1495 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1496 ierr = PetscFree(pa);CHKERRQ(ierr); 1497 1498 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1499 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1500 for (i=0; i<nrqr; ++i) { 1501 ierr = 
PetscFree(sbuf2[i]);CHKERRQ(ierr); 1502 } 1503 for (i=0; i<nrqs; ++i) { 1504 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1505 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1506 } 1507 1508 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1509 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1510 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1511 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1512 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1513 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1514 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1515 1516 #if defined(PETSC_USE_CTABLE) 1517 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1518 #else 1519 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1520 #endif 1521 ierr = PetscFree(rmap);CHKERRQ(ierr); 1522 1523 for (i=0; i<ismax; i++) { 1524 if (!allcolumns[i]) { 1525 #if defined(PETSC_USE_CTABLE) 1526 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1527 #else 1528 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1529 #endif 1530 } 1531 } 1532 ierr = PetscFree(cmap);CHKERRQ(ierr); 1533 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1534 ierr = PetscFree(lens);CHKERRQ(ierr); 1535 1536 for (i=0; i<ismax; i++) { 1537 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1538 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1539 } 1540 PetscFunctionReturn(0); 1541 } 1542 1543 /* 1544 Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B. 1545 Embeddings are supposed to be injections and the above implies that the range of rowemb is a subset 1546 of [0,m), dcolemb is in [0,n) and ocolemb is in [N-n). 1547 If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B. 1548 After that B's columns are mapped into C's global column space, so that C is in the "disassembled" 1549 state, and needs to be "assembled" later by compressing B's column space. 
1550 1551 This function may be called in lieu of preallocation, so C should not be expected to be preallocated. 1552 Following this call, C->A & C->B have been created, even if empty. 1553 */ 1554 #undef __FUNCT__ 1555 #define __FUNCT__ "MatSetSeqMats_MPIAIJ" 1556 PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B) 1557 { 1558 /* If making this function public, change the error returned in this function away from _PLIB. */ 1559 PetscErrorCode ierr; 1560 Mat_MPIAIJ *aij; 1561 Mat_SeqAIJ *Baij; 1562 PetscBool seqaij,Bdisassembled; 1563 PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count; 1564 PetscScalar v; 1565 const PetscInt *rowindices,*colindices; 1566 1567 PetscFunctionBegin; 1568 /* Check to make sure the component matrices (and embeddings) are compatible with C. */ 1569 if (A) { 1570 ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 1571 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type"); 1572 if (rowemb) { 1573 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 1574 if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n); 1575 } else { 1576 if (C->rmap->n != A->rmap->n) { 1577 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix"); 1578 } 1579 } 1580 if (dcolemb) { 1581 ierr = ISGetLocalSize(dcolemb,&n);CHKERRQ(ierr); 1582 if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n); 1583 } else { 1584 if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix"); 1585 } 1586 } 1587 if (B) { 1588 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);CHKERRQ(ierr); 1589 if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong 
type"); 1590 if (rowemb) { 1591 ierr = ISGetLocalSize(rowemb,&m);CHKERRQ(ierr); 1592 if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,A->rmap->n); 1593 } else { 1594 if (C->rmap->n != B->rmap->n) { 1595 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix"); 1596 } 1597 } 1598 if (ocolemb) { 1599 ierr = ISGetLocalSize(ocolemb,&n);CHKERRQ(ierr); 1600 if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n); 1601 } else { 1602 if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix"); 1603 } 1604 } 1605 1606 aij = (Mat_MPIAIJ*)(C->data); 1607 if (!aij->A) { 1608 /* Mimic parts of MatMPIAIJSetPreallocation() */ 1609 ierr = MatCreate(PETSC_COMM_SELF,&aij->A);CHKERRQ(ierr); 1610 ierr = MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);CHKERRQ(ierr); 1611 ierr = MatSetBlockSizesFromMats(aij->A,C,C);CHKERRQ(ierr); 1612 ierr = MatSetType(aij->A,MATSEQAIJ);CHKERRQ(ierr); 1613 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);CHKERRQ(ierr); 1614 } 1615 if (A) { 1616 ierr = MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);CHKERRQ(ierr); 1617 } else { 1618 ierr = MatSetUp(aij->A);CHKERRQ(ierr); 1619 } 1620 if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */ 1621 /* 1622 If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and 1623 need to "disassemble" B -- convert it to using C's global indices. 1624 To insert the values we take the safer, albeit more expensive, route of MatSetValues(). 1625 1626 If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate; 1627 we MatZeroValues(B) first, so there may be a bunch of zeros that, perhaps, could be compacted out. 
1628 1629 TODO: Put B's values into aij->B's aij structure in place using the embedding ISs? 1630 At least avoid calling MatSetValues() and the implied searches? 1631 */ 1632 1633 if (B && pattern == DIFFERENT_NONZERO_PATTERN) { 1634 #if defined(PETSC_USE_CTABLE) 1635 ierr = PetscTableDestroy(&aij->colmap);CHKERRQ(ierr); 1636 #else 1637 ierr = PetscFree(aij->colmap);CHKERRQ(ierr); 1638 ierr = PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1639 #endif 1640 ngcol = 0; 1641 if (aij->lvec) { 1642 ierr = VecGetSize(aij->lvec,&ngcol);CHKERRQ(ierr); 1643 } 1644 if (aij->garray) { 1645 ierr = PetscFree(aij->garray);CHKERRQ(ierr); 1646 ierr = PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));CHKERRQ(ierr); 1647 } 1648 ierr = VecDestroy(&aij->lvec);CHKERRQ(ierr); 1649 ierr = VecScatterDestroy(&aij->Mvctx);CHKERRQ(ierr); 1650 } 1651 if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) { 1652 ierr = MatDestroy(&aij->B);CHKERRQ(ierr); 1653 } 1654 if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) { 1655 ierr = MatZeroEntries(aij->B);CHKERRQ(ierr); 1656 } 1657 } 1658 Bdisassembled = PETSC_FALSE; 1659 if (!aij->B) { 1660 ierr = MatCreate(PETSC_COMM_SELF,&aij->B);CHKERRQ(ierr); 1661 ierr = PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);CHKERRQ(ierr); 1662 ierr = MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);CHKERRQ(ierr); 1663 ierr = MatSetBlockSizesFromMats(aij->B,B,B);CHKERRQ(ierr); 1664 ierr = MatSetType(aij->B,MATSEQAIJ);CHKERRQ(ierr); 1665 Bdisassembled = PETSC_TRUE; 1666 } 1667 if (B) { 1668 Baij = (Mat_SeqAIJ*)(B->data); 1669 if (pattern == DIFFERENT_NONZERO_PATTERN) { 1670 ierr = PetscMalloc1(B->rmap->n,&nz);CHKERRQ(ierr); 1671 for (i=0; i<B->rmap->n; i++) { 1672 nz[i] = Baij->i[i+1] - Baij->i[i]; 1673 } 1674 ierr = MatSeqAIJSetPreallocation(aij->B,0,nz);CHKERRQ(ierr); 1675 ierr = PetscFree(nz);CHKERRQ(ierr); 1676 } 1677 1678 ierr = PetscLayoutGetRange(C->rmap,&rstart,&rend);CHKERRQ(ierr); 
1679 shift = rend-rstart; 1680 count = 0; 1681 rowindices = NULL; 1682 colindices = NULL; 1683 if (rowemb) { 1684 ierr = ISGetIndices(rowemb,&rowindices);CHKERRQ(ierr); 1685 } 1686 if (ocolemb) { 1687 ierr = ISGetIndices(ocolemb,&colindices);CHKERRQ(ierr); 1688 } 1689 for (i=0; i<B->rmap->n; i++) { 1690 PetscInt row; 1691 row = i; 1692 if (rowindices) row = rowindices[i]; 1693 for (j=Baij->i[i]; j<Baij->i[i+1]; j++) { 1694 col = Baij->j[count]; 1695 if (colindices) col = colindices[col]; 1696 if (Bdisassembled && col>=rstart) col += shift; 1697 v = Baij->a[count]; 1698 ierr = MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);CHKERRQ(ierr); 1699 ++count; 1700 } 1701 } 1702 /* No assembly for aij->B is necessary. */ 1703 /* FIXME: set aij->B's nonzerostate correctly. */ 1704 } else { 1705 ierr = MatSetUp(aij->B);CHKERRQ(ierr); 1706 } 1707 C->preallocated = PETSC_TRUE; 1708 C->was_assembled = PETSC_FALSE; 1709 C->assembled = PETSC_FALSE; 1710 /* 1711 C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ(). 1712 Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's. 1713 */ 1714 PetscFunctionReturn(0); 1715 } 1716 1717 #undef __FUNCT__ 1718 #define __FUNCT__ "MatGetSeqMats_MPIAIJ" 1719 /* 1720 B uses local indices with column indices ranging between 0 and N-n; they must be interpreted using garray. 1721 */ 1722 PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B) 1723 { 1724 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1725 1726 PetscFunctionBegin; 1727 PetscValidPointer(A,2); 1728 PetscValidPointer(B,3); 1729 /* FIXME: make sure C is assembled */ 1730 *A = aij->A; 1731 *B = aij->B; 1732 /* Note that we don't incref *A and *B, so be careful! */ 1733 PetscFunctionReturn(0); 1734 } 1735 1736 /* 1737 Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C. 1738 NOT SCALABLE due to the use of ISGetNonlocalIS() (see below). 
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIXAIJ"
/*
  Extract multiple submatrices of C where some of the index-set pairs (isrow[i],iscol[i]) may live
  on subcommunicators with more than one rank; for those, the resulting submatrix is itself a
  parallel (MPIAIJ-like) matrix assembled from separately extracted sequential diag (A) and
  off-diag (B) pieces. Purely sequential requests fall through to getsubmats_seq() directly.

  Matrix-type-specific work is delegated through the function-pointer arguments:
    getsubmats_seq - extracts sequential submatrices of C for a list of (row,col) index sets;
    getlocalmats   - retrieves the local diag (A) and off-diag (B) sequential blocks of an MPI submatrix;
    setseqmat      - embeds a sequential matrix into a sequential submatrix via permutation ISs;
    setseqmats     - embeds the A and B pieces into a parallel submatrix via permutation ISs.

  scall == MAT_INITIAL_MATRIX allocates *submat and creates the submatrices; MAT_REUSE_MATRIX reuses
  the matrices already present in *submat (assumed to have the same nonzero pattern as before).

  NOTE(review): collective on C's communicator (performs an MPI_Allreduce below), so every rank must
  participate even if all of its own index sets are sequential.
*/
PetscErrorCode MatGetSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                            PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
                                            PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
                                            PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
                                            PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
{
  PetscErrorCode ierr;
  PetscMPIInt    isize,flag;
  PetscInt       i,ii,cismax,ispar;
  Mat            *A,*B;
  IS             *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Count this rank's parallel index sets (comm size > 1); also require each (isrow,iscol) pair
     to share a communicator. */
  for (i = 0, cismax = 0; i < ismax; ++i) {
    PetscMPIInt isize; /* NOTE(review): shadows the function-scope 'isize' declared above; harmless but worth cleaning up */
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &isize);CHKERRQ(ierr);
    if (isize > 1) ++cismax;
  }
  /*
     If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
     ispar is nonzero iff some rank has at least one parallel IS (MPI_MAX reduction of the local counts).
  */
  ierr = MPI_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if (ispar) */
  /*
    Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
    These are used to extract the off-diag portion of the resulting parallel matrix.
    The row IS for the off-diag portion is the same as for the diag portion,
    so we merely alias (without increfing) the row IS, while skipping those that are sequential.
  */
  ierr = PetscMalloc2(cismax,&cisrow,cismax,&ciscol);CHKERRQ(ierr);
  ierr = PetscMalloc1(cismax,&ciscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) { /* ii indexes only the parallel ISs */
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      /*
         TODO: This is the part that's ***NOT SCALABLE***.
         To fix this we need to extract just the indices of C's nonzero columns
         that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
         part of iscol[i] -- without actually computing ciscol[ii]. This also has
         to be done without serializing on the IS list, so, most likely, it is best
         done by rewriting MatGetSubMatrices_MPIAIJ() directly.
      */
      ierr = ISGetNonlocalIS(iscol[i],&(ciscol[ii]));CHKERRQ(ierr);
      /* Now we have to
         (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
             were sorted on each rank, concatenated they might no longer be sorted;
         (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
             indices in the nondecreasing order to the original index positions.
         If ciscol[ii] is strictly increasing, the permutation IS is NULL.
      */
      ierr = ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);CHKERRQ(ierr);
      ierr = ISSort(ciscol[ii]);CHKERRQ(ierr);
      ++ii;
    }
  }
  ierr = PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);CHKERRQ(ierr);
  for (i = 0, ii = 0; i < ismax; ++i) {
    PetscInt       j,issize;
    const PetscInt *indices;

    /*
       Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
    */
    ierr = ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);CHKERRQ(ierr);
    ierr = ISSort(isrow[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(isrow[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(isrow[i],&indices);CHKERRQ(ierr);
    /* After sorting, duplicates are adjacent, so a single pass detects them. */
    for (j = 1; j < issize; ++j) {
      if (indices[j] == indices[j-1]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(isrow[i],&indices);CHKERRQ(ierr);


    ierr = ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);CHKERRQ(ierr);
    ierr = ISSort(iscol[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&issize);CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&indices);CHKERRQ(ierr);
    for (j = 1; j < issize; ++j) {
      if (indices[j-1] == indices[j]) {
        SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
      }
    }
    ierr = ISRestoreIndices(iscol[i],&indices);CHKERRQ(ierr);
    /* Alias (no reference taken) the row IS as the off-diag row IS for parallel index sets. */
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    if (isize > 1) {
      cisrow[ii] = isrow[i];
      ++ii;
    }
  }
  /*
    Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
    array of sequential matrices underlying the resulting parallel matrices.
    Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
    contain duplicates.

    There are as many diag matrices as there are original index sets. There are only as many parallel
    and off-diag matrices, as there are parallel (comm size > 1) index sets.

    ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
    - If the array of MPI matrices already exists and is being reused, we need to allocate the array
      and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
      will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
      with A[i] and B[ii] extracted from the corresponding MPI submat.
    - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
      will have a different order from what getsubmats_seq expects. To handle this case -- indicated
      by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
      (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute its
      values into A[i] and B[ii] sitting inside the corresponding submat.
    - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
      A[i], B[ii], AA[i] or BB[ii] matrices.
  */
  /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    /* If not reusing, sequential matrix arrays are allocated by getsubmats_seq(). */
  } else {
    ierr = PetscMalloc1(ismax,&A);CHKERRQ(ierr);
    ierr = PetscMalloc1(cismax,&B);CHKERRQ(ierr);
    /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well, unless permutations are not NULL. */
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
      if (isize > 1) {
        Mat AA,BB;
        ierr = (*getlocalmats)((*submat)[i],&AA,&BB);CHKERRQ(ierr);
        if (!isrow_p[i] && !iscol_p[i]) {
          /* No permutations: getsubmats_seq() can deposit straight into the existing local blocks. */
          A[i] = AA;
        } else {
          /* TODO: extract A[i] composed on AA. */
          ierr = MatDuplicate(AA,MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
        if (!isrow_p[i] && !ciscol_p[ii]) {
          B[ii] = BB;
        } else {
          /* TODO: extract B[ii] composed on BB. */
          ierr = MatDuplicate(BB,MAT_SHARE_NONZERO_PATTERN,B+ii);CHKERRQ(ierr);
        }
        ++ii;
      } else {
        if (!isrow_p[i] && !iscol_p[i]) {
          A[i] = (*submat)[i];
        } else {
          /* TODO: extract A[i] composed on (*submat)[i]. */
          ierr = MatDuplicate((*submat)[i],MAT_SHARE_NONZERO_PATTERN,A+i);CHKERRQ(ierr);
        }
      }
    }
  }
  /* Now obtain the sequential A and B submatrices separately. */
  ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,&A);CHKERRQ(ierr);
  ierr = (*getsubmats_seq)(C,cismax,cisrow,ciscol,scall,&B);CHKERRQ(ierr);
  /*
    If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
    matrices A & B have been extracted directly into the parallel matrices containing them, or
    simply into the sequential matrix identical with the corresponding A (if isize == 1).
    Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
    to have the same sparsity pattern.
    Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
    must be constructed for C. This is done by setseqmat(s).
  */
  for (i = 0, ii = 0; i < ismax; ++i) {
    /*
       TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
       That way we can avoid sorting and computing permutations when reusing.
       To this end:
       - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
       - if caching arrays to hold the ISs, make and compose a container for them so that it can
         be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
    */
    MatStructure pattern;
    if (scall == MAT_INITIAL_MATRIX) {
      pattern = DIFFERENT_NONZERO_PATTERN;
    } else {
      pattern = SAME_NONZERO_PATTERN;
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm,&isize);CHKERRQ(ierr);
    /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
    if (isize > 1) {
      if (scall == MAT_INITIAL_MATRIX) {
        ierr = MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);CHKERRQ(ierr);
        ierr = MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
        ierr = MatSetType((*submat)[i],MATMPIAIJ);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->rmap);CHKERRQ(ierr);
        ierr = PetscLayoutSetUp((*submat)[i]->cmap);CHKERRQ(ierr);
      }
      /*
        For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
      */
      {
        Mat AA,BB;
        /* AA/BB are non-NULL exactly when the corresponding piece still needs to be embedded
           (fresh matrix, or values must be permuted into place). */
        AA = NULL;
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) AA = A[i];
        BB = NULL;
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) BB = B[ii];
        if (AA || BB) {
          ierr = setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);CHKERRQ(ierr);
          ierr = MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
          ierr = MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
        }
        if (isrow_p[i] || iscol_p[i] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose AA for future use, if (isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX. */
          ierr = MatDestroy(&AA);CHKERRQ(ierr);
        }
        if (isrow_p[i] || ciscol_p[ii] || scall == MAT_INITIAL_MATRIX) {
          /* TODO: Compose BB for future use, if (isrow_p[i] || ciscol_p[ii]) && MAT_INITIAL_MATRIX */
          ierr = MatDestroy(&BB);CHKERRQ(ierr);
        }
      }
      ierr = ISDestroy(ciscol+ii);CHKERRQ(ierr);
      ierr = ISDestroy(ciscol_p+ii);CHKERRQ(ierr);
      ++ii;
    } else { /* if (isize == 1) */
      if (scall == MAT_INITIAL_MATRIX) {
        if (isrow_p[i] || iscol_p[i]) {
          ierr = MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);CHKERRQ(ierr);
        } else (*submat)[i] = A[i]; /* alias: A[i] IS the result, no embedding needed */
      }
      if (isrow_p[i] || iscol_p[i]) {
        ierr = setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);CHKERRQ(ierr);
        /* Otherwise A is extracted straight into (*submats)[i]. */
        /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
        ierr = MatDestroy(A+i);CHKERRQ(ierr);
      }
    }
    ierr = ISDestroy(&isrow_p[i]);CHKERRQ(ierr);
    ierr = ISDestroy(&iscol_p[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree2(cisrow,ciscol);CHKERRQ(ierr);
  ierr = PetscFree2(isrow_p,iscol_p);CHKERRQ(ierr);
  ierr = PetscFree(ciscol_p);CHKERRQ(ierr);
  /* The arrays of seq matrices are freed; the Mat objects themselves were either aliased into
     *submat or destroyed above. */
  ierr = PetscFree(A);CHKERRQ(ierr);
  ierr = PetscFree(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesMPI_MPIAIJ"
/*
  MPIAIJ instantiation of the generic driver above: binds the AIJ-specific extraction and
  embedding routines and forwards all arguments unchanged.
*/
PetscErrorCode MatGetSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}