/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

/* NOTE(review): this prototype declares the third argument as char** while the
   definition below takes PetscBT* — presumably PetscBT is a char* typedef so the
   types agree; confirm against petscbt.h */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Grows each index set in is[] by ov levels of
   overlap by repeatedly applying a single level of overlap (the index sets
   are replaced in place by MatIncreaseOverlap_MPIAIJ_Once).

   Input:
     C    - the MPIAIJ matrix
     imax - number of index sets
     is   - the index sets; overwritten with the enlarged sets
     ov   - number of overlap levels requested (must be non-negative)
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1   => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs a single level of overlap increase.

   Phase 1: each process figures out which requested rows are owned remotely,
   packs them (with the header format documented above) and Isends them (tag1),
   while locally-owned rows are expanded via MatIncreaseOverlap_MPIAIJ_Local.
   Phase 2: each process answers the requests it received
   (MatIncreaseOverlap_MPIAIJ_Receive) and Isends the answers back (tag2);
   the replies are merged into the per-IS bit tables and the enlarged index
   sets are re-created.  The input index sets in is[] are destroyed and
   replaced by the new ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* two fresh tags keep the request traffic (tag1) separate from the reply traffic (tag2) */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,&idx,imax,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total indices destined for proc p, w2[p] = 1 iff a message goes to p,
     w3[p] = number of distinct index sets contributing to the message for p,
     w4    = per-IS scratch counts */
  ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages: tmp is one contiguous slab of msz
     integers carved up among the destination processes via outdat[] */
  ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: outdat[j][0] counts the ISs, ptr[j] points past the header */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table and one data array per IS,
     both carved from single allocations (t_p, d_p) */
  {
    ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, M*imax,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row; /* Update the local table */
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input ISs are destroyed here; they are re-created from the tables at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;

      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc1((PetscMax(nrqr,nrqs)+1),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: merge each returned row into the matching IS table */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (now enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions

   For every row already in an index set, the column indices of that row in
   both the diagonal (A) and off-diagonal (B) local blocks are added to the
   set, with the bit table preventing duplicates.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* note: only the rows present on entry (j < isz[i]) are expanded;
       entries appended inside this loop are not themselves revisited */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local columns shifted to global numbering */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: garray maps compressed columns to global */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

     Input:
       C    - the matrix
       nrqr - no of messages being processed.
       rbuf - an array of pointers to the received requests

     Output:
       xdata - array of messages to be sent back
       isz1  - size of each message

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total up the number of requested rows (ct) and indices (total_sz) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
  }

  /* size the reply buffer by the average number of nonzeros per local row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 tracks total ints written into the single xdata[0] slab; when it would
     exceed mem_estimate, the slab is grown by 1.5x and every xdata[i] pointer
     is re-based onto the new slab */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand every requested row by its A-block and B-block column indices */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* next reply starts right after this one in the slab */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Builds on every process a sequential copy of
   the whole parallel matrix, using MPI_Allgatherv of row lengths, column
   indices and (optionally, flag == MAT_GET_VALUES) numerical values.
   With scall == MAT_REUSE_MATRIX only the values are refreshed.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc1(A->rmap->N,&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizesFromMats(B,A,A);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc1(1,Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ*)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into the
       slot of B's j array that corresponds to this rank's rows
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  } else {
    B = **Bin;
    b = (Mat_SeqAIJ*)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {
      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr = PetscMalloc2(size,&recvcounts,size,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices from the
   parallel matrix C.  Detects the "every process wants the whole matrix"
   fast path, then processes the index sets in stages (nmax per stage, sized
   so each stage's column workspace stays around 20 MB) via
   MatGetSubMatrices_MPIAIJ_Local.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;

      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);CHKERRQ(ierr);
    }
  }
  /* the fast path is collective, so every rank must agree on taking it */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc1(ismax+1,submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc1(ismax+1,&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0; /* still participate in the collective call */
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt
**rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 803 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 804 #if defined(PETSC_USE_CTABLE) 805 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i; 806 #else 807 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i; 808 #endif 809 const PetscInt *irow_i; 810 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 811 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 812 MPI_Request *r_waits4,*s_waits3,*s_waits4; 813 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 814 MPI_Status *r_status3,*r_status4,*s_status4; 815 MPI_Comm comm; 816 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 817 PetscMPIInt *onodes1,*olengths1; 818 PetscMPIInt idex,idex2,end; 819 820 PetscFunctionBegin; 821 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 822 tag0 = ((PetscObject)C)->tag; 823 size = c->size; 824 rank = c->rank; 825 826 /* Get some new tags to keep the communication clean */ 827 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 828 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 829 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 830 831 ierr = PetscMalloc4(ismax,&irow,ismax,&icol,ismax,&nrow,ismax,&ncol);CHKERRQ(ierr); 832 833 for (i=0; i<ismax; i++) { 834 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 835 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 836 if (allcolumns[i]) { 837 icol[i] = NULL; 838 ncol[i] = C->cmap->N; 839 } else { 840 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 841 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 842 } 843 } 844 845 /* evaluate communication - mesg to who, length of mesg, and buffer space 846 required. 
Based on this, buffers are allocated, and data copied into them*/ 847 ierr = PetscMalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size */ 848 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 849 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 850 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 851 for (i=0; i<ismax; i++) { 852 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 853 jmax = nrow[i]; 854 irow_i = irow[i]; 855 for (j=0; j<jmax; j++) { 856 l = 0; 857 row = irow_i[j]; 858 while (row >= C->rmap->range[l+1]) l++; 859 proc = l; 860 w4[proc]++; 861 } 862 for (j=0; j<size; j++) { 863 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 864 } 865 } 866 867 nrqs = 0; /* no of outgoing messages */ 868 msz = 0; /* total mesg length (for all procs) */ 869 w1[rank] = 0; /* no mesg sent to self */ 870 w3[rank] = 0; 871 for (i=0; i<size; i++) { 872 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 873 } 874 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /*(proc -array)*/ 875 for (i=0,j=0; i<size; i++) { 876 if (w1[i]) { pa[j] = i; j++; } 877 } 878 879 /* Each message would have a header = 1 + 2*(no of IS) + data */ 880 for (i=0; i<nrqs; i++) { 881 j = pa[i]; 882 w1[j] += w2[j] + 2* w3[j]; 883 msz += w1[j]; 884 } 885 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 886 887 /* Determine the number of messages to expect, their lengths, from from-ids */ 888 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 889 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 890 891 /* Now post the Irecvs corresponding to these messages */ 892 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 893 894 ierr = PetscFree(onodes1);CHKERRQ(ierr); 
895 ierr = PetscFree(olengths1);CHKERRQ(ierr); 896 897 /* Allocate Memory for outgoing messages */ 898 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr); 899 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 900 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 901 902 { 903 PetscInt *iptr = tmp,ict = 0; 904 for (i=0; i<nrqs; i++) { 905 j = pa[i]; 906 iptr += ict; 907 sbuf1[j] = iptr; 908 ict = w1[j]; 909 } 910 } 911 912 /* Form the outgoing messages */ 913 /* Initialize the header space */ 914 for (i=0; i<nrqs; i++) { 915 j = pa[i]; 916 sbuf1[j][0] = 0; 917 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 918 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 919 } 920 921 /* Parse the isrow and copy data into outbuf */ 922 for (i=0; i<ismax; i++) { 923 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 924 irow_i = irow[i]; 925 jmax = nrow[i]; 926 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 927 l = 0; 928 row = irow_i[j]; 929 while (row >= C->rmap->range[l+1]) l++; 930 proc = l; 931 if (proc != rank) { /* copy to the outgoing buf*/ 932 ctr[proc]++; 933 *ptr[proc] = row; 934 ptr[proc]++; 935 } 936 } 937 /* Update the headers for the current IS */ 938 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 939 if ((ctr_j = ctr[j])) { 940 sbuf1_j = sbuf1[j]; 941 k = ++sbuf1_j[0]; 942 sbuf1_j[2*k] = ctr_j; 943 sbuf1_j[2*k-1] = i; 944 } 945 } 946 } 947 948 /* Now post the sends */ 949 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr); 950 for (i=0; i<nrqs; ++i) { 951 j = pa[i]; 952 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 953 } 954 955 /* Post Receives to capture the buffer size */ 956 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr); 957 ierr = PetscMalloc1(nrqs+1,&rbuf2);CHKERRQ(ierr); 958 rbuf2[0] = tmp + msz; 959 for (i=1; i<nrqs; ++i) { 960 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 961 } 962 for (i=0; i<nrqs; ++i) { 963 j = pa[i]; 964 
ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 965 } 966 967 /* Send to other procs the buf size they should allocate */ 968 969 970 /* Receive messages*/ 971 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr); 972 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr); 973 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source);CHKERRQ(ierr); 974 { 975 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 976 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 977 PetscInt *sbuf2_i; 978 979 for (i=0; i<nrqr; ++i) { 980 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 981 982 req_size[idex] = 0; 983 rbuf1_i = rbuf1[idex]; 984 start = 2*rbuf1_i[0] + 1; 985 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 986 ierr = PetscMalloc1(end+1,&sbuf2[idex]);CHKERRQ(ierr); 987 sbuf2_i = sbuf2[idex]; 988 for (j=start; j<end; j++) { 989 id = rbuf1_i[j] - rstart; 990 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 991 sbuf2_i[j] = ncols; 992 req_size[idex] += ncols; 993 } 994 req_source[idex] = r_status1[i].MPI_SOURCE; 995 /* form the header */ 996 sbuf2_i[0] = req_size[idex]; 997 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j]; 998 999 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1000 } 1001 } 1002 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1003 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1004 1005 /* recv buffer sizes */ 1006 /* Receive messages*/ 1007 1008 ierr = PetscMalloc1(nrqs+1,&rbuf3);CHKERRQ(ierr); 1009 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr); 1010 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr); 1011 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr); 1012 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr); 1013 1014 for (i=0; i<nrqs; ++i) { 1015 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1016 ierr = PetscMalloc1(rbuf2[idex][0]+1,&rbuf3[idex]);CHKERRQ(ierr); 1017 ierr = 
PetscMalloc1(rbuf2[idex][0]+1,&rbuf4[idex]);CHKERRQ(ierr); 1018 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1019 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1020 } 1021 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1022 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1023 1024 /* Wait on sends1 and sends2 */ 1025 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr); 1026 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr); 1027 1028 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1029 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1030 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1031 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1032 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1033 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1034 1035 /* Now allocate buffers for a->j, and send them off */ 1036 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr); 1037 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1038 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr); 1039 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1040 1041 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr); 1042 { 1043 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1044 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1045 PetscInt cend = C->cmap->rend; 1046 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1047 1048 for (i=0; i<nrqr; i++) { 1049 rbuf1_i = rbuf1[i]; 1050 sbuf_aj_i = sbuf_aj[i]; 1051 ct1 = 2*rbuf1_i[0] + 1; 1052 ct2 = 0; 1053 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1054 kmax = rbuf1[i][2*j]; 1055 for (k=0; k<kmax; k++,ct1++) { 1056 row = rbuf1_i[ct1] - rstart; 1057 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1058 ncols = nzA + nzB; 1059 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1060 1061 /* load the column indices for this row into cols*/ 
1062 cols = sbuf_aj_i + ct2; 1063 1064 lwrite = 0; 1065 for (l=0; l<nzB; l++) { 1066 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1067 } 1068 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1069 for (l=0; l<nzB; l++) { 1070 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1071 } 1072 1073 ct2 += ncols; 1074 } 1075 } 1076 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1077 } 1078 } 1079 ierr = PetscMalloc1(nrqs+1,&r_status3);CHKERRQ(ierr); 1080 ierr = PetscMalloc1(nrqr+1,&s_status3);CHKERRQ(ierr); 1081 1082 /* Allocate buffers for a->a, and send them off */ 1083 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr); 1084 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1085 ierr = PetscMalloc1(j+1,&sbuf_aa[0]);CHKERRQ(ierr); 1086 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1087 1088 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr); 1089 { 1090 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1091 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1092 PetscInt cend = C->cmap->rend; 1093 PetscInt *b_j = b->j; 1094 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1095 1096 for (i=0; i<nrqr; i++) { 1097 rbuf1_i = rbuf1[i]; 1098 sbuf_aa_i = sbuf_aa[i]; 1099 ct1 = 2*rbuf1_i[0]+1; 1100 ct2 = 0; 1101 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1102 kmax = rbuf1_i[2*j]; 1103 for (k=0; k<kmax; k++,ct1++) { 1104 row = rbuf1_i[ct1] - rstart; 1105 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1106 ncols = nzA + nzB; 1107 cworkB = b_j + b_i[row]; 1108 vworkA = a_a + a_i[row]; 1109 vworkB = b_a + b_i[row]; 1110 1111 /* load the column values for this row into vals*/ 1112 vals = sbuf_aa_i+ct2; 1113 1114 lwrite = 0; 1115 for (l=0; l<nzB; l++) { 1116 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1117 } 1118 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1119 for (l=0; l<nzB; l++) { 1120 if ((bmap[cworkB[l]]) 
>= cend) vals[lwrite++] = vworkB[l]; 1121 } 1122 1123 ct2 += ncols; 1124 } 1125 } 1126 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1127 } 1128 } 1129 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1130 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1131 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr); 1132 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr); 1133 1134 /* Form the matrix */ 1135 /* create col map: global col of C -> local col of submatrices */ 1136 { 1137 const PetscInt *icol_i; 1138 #if defined(PETSC_USE_CTABLE) 1139 ierr = PetscMalloc1(1+ismax,&cmap);CHKERRQ(ierr); 1140 for (i=0; i<ismax; i++) { 1141 if (!allcolumns[i]) { 1142 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1143 1144 jmax = ncol[i]; 1145 icol_i = icol[i]; 1146 cmap_i = cmap[i]; 1147 for (j=0; j<jmax; j++) { 1148 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1149 } 1150 } else { 1151 cmap[i] = NULL; 1152 } 1153 } 1154 #else 1155 ierr = PetscMalloc1(ismax,&cmap);CHKERRQ(ierr); 1156 for (i=0; i<ismax; i++) { 1157 if (!allcolumns[i]) { 1158 ierr = PetscMalloc1(C->cmap->N,&cmap[i]);CHKERRQ(ierr); 1159 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1160 jmax = ncol[i]; 1161 icol_i = icol[i]; 1162 cmap_i = cmap[i]; 1163 for (j=0; j<jmax; j++) { 1164 cmap_i[icol_i[j]] = j+1; 1165 } 1166 } else { 1167 cmap[i] = NULL; 1168 } 1169 } 1170 #endif 1171 } 1172 1173 /* Create lens which is required for MatCreate... 
*/ 1174 for (i=0,j=0; i<ismax; i++) j += nrow[i]; 1175 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr); 1176 if (ismax) { 1177 ierr = PetscMalloc1(j,&lens[0]);CHKERRQ(ierr); 1178 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1179 } 1180 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1]; 1181 1182 /* Update lens from local data */ 1183 for (i=0; i<ismax; i++) { 1184 jmax = nrow[i]; 1185 if (!allcolumns[i]) cmap_i = cmap[i]; 1186 irow_i = irow[i]; 1187 lens_i = lens[i]; 1188 for (j=0; j<jmax; j++) { 1189 l = 0; 1190 row = irow_i[j]; 1191 while (row >= C->rmap->range[l+1]) l++; 1192 proc = l; 1193 if (proc == rank) { 1194 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1195 if (!allcolumns[i]) { 1196 for (k=0; k<ncols; k++) { 1197 #if defined(PETSC_USE_CTABLE) 1198 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1199 #else 1200 tcol = cmap_i[cols[k]]; 1201 #endif 1202 if (tcol) lens_i[j]++; 1203 } 1204 } else { /* allcolumns */ 1205 lens_i[j] = ncols; 1206 } 1207 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1208 } 1209 } 1210 } 1211 1212 /* Create row map: global row of C -> local row of submatrices */ 1213 #if defined(PETSC_USE_CTABLE) 1214 ierr = PetscMalloc1(1+ismax,&rmap);CHKERRQ(ierr); 1215 for (i=0; i<ismax; i++) { 1216 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1217 rmap_i = rmap[i]; 1218 irow_i = irow[i]; 1219 jmax = nrow[i]; 1220 for (j=0; j<jmax; j++) { 1221 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1222 } 1223 } 1224 #else 1225 ierr = PetscMalloc1(ismax,&rmap);CHKERRQ(ierr); 1226 if (ismax) { 1227 ierr = PetscMalloc1(ismax*C->rmap->N,&rmap[0]);CHKERRQ(ierr); 1228 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1229 } 1230 for (i=1; i<ismax; i++) rmap[i] = rmap[i-1] + C->rmap->N; 1231 for (i=0; i<ismax; i++) { 1232 rmap_i = rmap[i]; 1233 irow_i = irow[i]; 1234 jmax = nrow[i]; 1235 for (j=0; j<jmax; 
j++) { 1236 rmap_i[irow_i[j]] = j; 1237 } 1238 } 1239 #endif 1240 1241 /* Update lens from offproc data */ 1242 { 1243 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1244 1245 for (tmp2=0; tmp2<nrqs; tmp2++) { 1246 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1247 idex = pa[idex2]; 1248 sbuf1_i = sbuf1[idex]; 1249 jmax = sbuf1_i[0]; 1250 ct1 = 2*jmax+1; 1251 ct2 = 0; 1252 rbuf2_i = rbuf2[idex2]; 1253 rbuf3_i = rbuf3[idex2]; 1254 for (j=1; j<=jmax; j++) { 1255 is_no = sbuf1_i[2*j-1]; 1256 max1 = sbuf1_i[2*j]; 1257 lens_i = lens[is_no]; 1258 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1259 rmap_i = rmap[is_no]; 1260 for (k=0; k<max1; k++,ct1++) { 1261 #if defined(PETSC_USE_CTABLE) 1262 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1263 row--; 1264 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); 1265 #else 1266 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1267 #endif 1268 max2 = rbuf2_i[ct1]; 1269 for (l=0; l<max2; l++,ct2++) { 1270 if (!allcolumns[is_no]) { 1271 #if defined(PETSC_USE_CTABLE) 1272 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1273 #else 1274 tcol = cmap_i[rbuf3_i[ct2]]; 1275 #endif 1276 if (tcol) lens_i[row]++; 1277 } else { /* allcolumns */ 1278 lens_i[row]++; /* lens_i[row] += max2 ? */ 1279 } 1280 } 1281 } 1282 } 1283 } 1284 } 1285 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1286 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1287 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1288 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1289 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1290 1291 /* Create the submatrices */ 1292 if (scall == MAT_REUSE_MATRIX) { 1293 PetscBool flag; 1294 1295 /* 1296 Assumes new rows are same length as the old rows,hence bug! 
1297 */ 1298 for (i=0; i<ismax; i++) { 1299 mat = (Mat_SeqAIJ*)(submats[i]->data); 1300 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1301 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1302 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1303 /* Initial matrix as if empty */ 1304 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1305 1306 submats[i]->factortype = C->factortype; 1307 } 1308 } else { 1309 for (i=0; i<ismax; i++) { 1310 PetscInt rbs,cbs; 1311 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1312 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1313 1314 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1315 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1316 1317 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1318 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1319 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1320 } 1321 } 1322 1323 /* Assemble the matrices */ 1324 /* First assemble the local rows */ 1325 { 1326 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1327 PetscScalar *imat_a; 1328 1329 for (i=0; i<ismax; i++) { 1330 mat = (Mat_SeqAIJ*)submats[i]->data; 1331 imat_ilen = mat->ilen; 1332 imat_j = mat->j; 1333 imat_i = mat->i; 1334 imat_a = mat->a; 1335 1336 if (!allcolumns[i]) cmap_i = cmap[i]; 1337 rmap_i = rmap[i]; 1338 irow_i = irow[i]; 1339 jmax = nrow[i]; 1340 for (j=0; j<jmax; j++) { 1341 l = 0; 1342 row = irow_i[j]; 1343 while (row >= C->rmap->range[l+1]) l++; 1344 proc = l; 1345 if (proc == rank) { 1346 old_row = row; 1347 #if defined(PETSC_USE_CTABLE) 1348 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1349 row--; 1350 #else 1351 row = rmap_i[row]; 1352 #endif 
1353 ilen_row = imat_ilen[row]; 1354 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1355 mat_i = imat_i[row]; 1356 mat_a = imat_a + mat_i; 1357 mat_j = imat_j + mat_i; 1358 if (!allcolumns[i]) { 1359 for (k=0; k<ncols; k++) { 1360 #if defined(PETSC_USE_CTABLE) 1361 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1362 #else 1363 tcol = cmap_i[cols[k]]; 1364 #endif 1365 if (tcol) { 1366 *mat_j++ = tcol - 1; 1367 *mat_a++ = vals[k]; 1368 ilen_row++; 1369 } 1370 } 1371 } else { /* allcolumns */ 1372 for (k=0; k<ncols; k++) { 1373 *mat_j++ = cols[k]; /* global col index! */ 1374 *mat_a++ = vals[k]; 1375 ilen_row++; 1376 } 1377 } 1378 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1379 1380 imat_ilen[row] = ilen_row; 1381 } 1382 } 1383 } 1384 } 1385 1386 /* Now assemble the off proc rows*/ 1387 { 1388 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1389 PetscInt *imat_j,*imat_i; 1390 PetscScalar *imat_a,*rbuf4_i; 1391 1392 for (tmp2=0; tmp2<nrqs; tmp2++) { 1393 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1394 idex = pa[idex2]; 1395 sbuf1_i = sbuf1[idex]; 1396 jmax = sbuf1_i[0]; 1397 ct1 = 2*jmax + 1; 1398 ct2 = 0; 1399 rbuf2_i = rbuf2[idex2]; 1400 rbuf3_i = rbuf3[idex2]; 1401 rbuf4_i = rbuf4[idex2]; 1402 for (j=1; j<=jmax; j++) { 1403 is_no = sbuf1_i[2*j-1]; 1404 rmap_i = rmap[is_no]; 1405 if (!allcolumns[is_no]) cmap_i = cmap[is_no]; 1406 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1407 imat_ilen = mat->ilen; 1408 imat_j = mat->j; 1409 imat_i = mat->i; 1410 imat_a = mat->a; 1411 max1 = sbuf1_i[2*j]; 1412 for (k=0; k<max1; k++,ct1++) { 1413 row = sbuf1_i[ct1]; 1414 #if defined(PETSC_USE_CTABLE) 1415 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1416 row--; 1417 #else 1418 row = rmap_i[row]; 1419 #endif 1420 ilen = imat_ilen[row]; 1421 mat_i = imat_i[row]; 1422 mat_a = imat_a + mat_i; 1423 mat_j = imat_j + mat_i; 1424 max2 = rbuf2_i[ct1]; 1425 if (!allcolumns[is_no]) { 1426 
for (l=0; l<max2; l++,ct2++) { 1427 1428 #if defined(PETSC_USE_CTABLE) 1429 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1430 #else 1431 tcol = cmap_i[rbuf3_i[ct2]]; 1432 #endif 1433 if (tcol) { 1434 *mat_j++ = tcol - 1; 1435 *mat_a++ = rbuf4_i[ct2]; 1436 ilen++; 1437 } 1438 } 1439 } else { /* allcolumns */ 1440 for (l=0; l<max2; l++,ct2++) { 1441 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1442 *mat_a++ = rbuf4_i[ct2]; 1443 ilen++; 1444 } 1445 } 1446 imat_ilen[row] = ilen; 1447 } 1448 } 1449 } 1450 } 1451 1452 /* sort the rows */ 1453 { 1454 PetscInt *imat_ilen,*imat_j,*imat_i; 1455 PetscScalar *imat_a; 1456 1457 for (i=0; i<ismax; i++) { 1458 mat = (Mat_SeqAIJ*)submats[i]->data; 1459 imat_j = mat->j; 1460 imat_i = mat->i; 1461 imat_a = mat->a; 1462 imat_ilen = mat->ilen; 1463 1464 if (allcolumns[i]) continue; 1465 jmax = nrow[i]; 1466 for (j=0; j<jmax; j++) { 1467 PetscInt ilen; 1468 1469 mat_i = imat_i[j]; 1470 mat_a = imat_a + mat_i; 1471 mat_j = imat_j + mat_i; 1472 ilen = imat_ilen[j]; 1473 ierr = PetscSortIntWithMatScalarArray(ilen,mat_j,mat_a);CHKERRQ(ierr); 1474 } 1475 } 1476 } 1477 1478 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1479 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1480 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1481 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1482 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1483 1484 /* Restore the indices */ 1485 for (i=0; i<ismax; i++) { 1486 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1487 if (!allcolumns[i]) { 1488 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1489 } 1490 } 1491 1492 /* Destroy allocated memory */ 1493 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1494 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1495 ierr = PetscFree(pa);CHKERRQ(ierr); 1496 1497 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1498 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1499 for (i=0; i<nrqr; ++i) { 1500 ierr = 
PetscFree(sbuf2[i]);CHKERRQ(ierr);
  }
  /* Free the per-request receive buffers for column indices (rbuf3) and values (rbuf4) */
  for (i=0; i<nrqs; ++i) {
    ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
    ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
  }

  /* Free request bookkeeping and the aggregated send buffers for column indices and values */
  ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr);
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  /* Destroy the global-row -> local-row maps (hash tables or dense arrays, depending on PETSC_USE_CTABLE) */
#if defined(PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);}
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  /* Destroy the global-column -> local-column maps; these were only built when !allcolumns[i] */
  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]) {
#if defined(PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Finalize every extracted sequential submatrix */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Assembles an MPIAIJ matrix *C on comm whose diagonal block is the SeqAIJ
    matrix A and whose off-diagonal block is the SeqAIJ matrix B.

    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C:
     both must be SEQAIJ with matching local row counts and block sizes. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  if (PetscAbs(A->rmap->n) != PetscAbs(B->rmap->n) || PetscAbs(A->rmap->bs) != PetscAbs(B->rmap->bs) || PetscAbs(A->cmap->bs) != PetscAbs(B->cmap->bs)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  /* Local sizes come from the diagonal block; global sizes are derived by the layout setup. */
  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizesFromMats(*C,A,A);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The global column count must equal diag columns + off-diag columns */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  /* Install A and B directly as the diag/off-diag blocks; no reference is taken (see note above this function) */
  aij    = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)*C,(PetscObject)B);CHKERRQ(ierr);

  /* C is only as ready as both of its component matrices */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

/*
    Returns the diagonal (*A) and off-diagonal (*B) SeqAIJ blocks of the
    MPIAIJ matrix C. Inverse of MatCreateMPIAIJFromSeqMatrices_Private().
*/
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
    Generic driver that extracts (possibly parallel) submatrices of C indexed
    by isrow[]/iscol[]. Sequential index sets are handled entirely by
    getsubmats_seq; parallel index sets (communicator size > 1) are handled by
    extracting the diag and off-diag sequential pieces separately and gluing
    them together with makefromseq. extractseq recovers the sequential pieces
    from an existing parallel submatrix in the MAT_REUSE_MATRIX case.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;   /* number of parallel (comm size > 1) index sets */

  PetscFunctionBegin;
  if (!ismax) PetscFunctionReturn(0);

  /* Each isrow[i]/iscol[i] pair must live on the same communicator; count the parallel pairs */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) ++ismax_c;
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat *A,*B;
    IS  *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

     Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
     getsubmats_seq will deposit the extracted diag and off-diag parts.
     However, if reuse is taking place, we have to allocate the seq matrix arrays here.
     If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
     */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc1(ismax,submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc1(ismax, &A);CHKERRQ(ierr);
      ierr = PetscMalloc1(ismax_c, &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /* Parallel submatrix: pull out its existing diag (A[i]) and off-diag (B[ii]) pieces */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else A[i] = (*submat)[i];
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
     */
    ierr = PetscMalloc2(ismax_c,&isrow_c, ismax_c, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i];   /* aliased, not duplicated -- do not destroy below */

        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
     If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
     have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or the
     A is put into the correct submat slot (if size == 1).
     */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
           For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
           */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else (*submat)[i] = A[i];
      }
    }
    /* Free only the holder arrays; the seq matrices themselves are now owned by (or aliased into) *submat */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
} /* MatGetSubMatricesParallel_MPIXAIJ() */



/* AIJ instantiation of the generic driver: plugs in the MPIAIJ-specific extraction/assembly callbacks. */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}