/*
   Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - grows each of the imax index sets is[] by ov
   levels of overlap.  Collective on C; each call to _Once below adds exactly
   one level (every column coupled through the nonzero structure to a row
   already in the set).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   Adds one level of overlap to each is[]: rows owned locally are expanded
   in-place by _Local(); rows owned elsewhere are shipped to their owners
   (phase 1, tag1), expanded there by _Receive(), and the results shipped
   back (phase 2, tag2) and merged.  NOTE(review): the per-set data[] arrays
   hold at most M entries each, which bounds the merged result by the global
   row count.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them.
     w1[p] = total indices destined for proc p, w3[p] = number of index sets
     contributing to p, w2[p] = 1 iff a message goes to p at all. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; tmp is one contiguous slab that
     outdat[] partitions per destination, ptr[] tracks the write cursor. */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table (t_p) and one index
     buffer of M entries (d_p) per index set, allocated as two slabs. */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now  post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the enlarged sets are rebuilt from data[] at the end, so the inputs
     can be destroyed now */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* expand the rows requested by other processors; xdata[i]/isz1[i] hold the
     reply for the i-th received request */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge it into the local
     tables/data, reusing the same message layout as phase 1 */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* rebuild the (now enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
344 345 Output: 346 isz - array containing the count of the solution elements corresponding 347 to each index set; 348 data - pointer to the solutions 349 */ 350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 351 { 352 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 353 Mat A = c->A,B = c->B; 354 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 355 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 356 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 357 PetscBT table_i; 358 359 PetscFunctionBegin; 360 rstart = C->rmap->rstart; 361 cstart = C->cmap->rstart; 362 ai = a->i; 363 aj = a->j; 364 bi = b->i; 365 bj = b->j; 366 garray = c->garray; 367 368 369 for (i=0; i<imax; i++) { 370 data_i = data[i]; 371 table_i = table[i]; 372 isz_i = isz[i]; 373 for (j=0,max=isz[i]; j<max; j++) { 374 row = data_i[j] - rstart; 375 start = ai[row]; 376 end = ai[row+1]; 377 for (k=start; k<end; k++) { /* Amat */ 378 val = aj[k] + cstart; 379 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 380 } 381 start = bi[row]; 382 end = bi[row+1]; 383 for (k=start; k<end; k++) { /* Bmat */ 384 val = garray[bj[k]]; 385 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 386 } 387 } 388 isz[i] = isz_i; 389 } 390 PetscFunctionReturn(0); 391 } 392 393 #undef __FUNCT__ 394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 395 /* 396 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 397 and return the output 398 399 Input: 400 C - the matrix 401 nrqr - no of messages being processed. 
402 rbuf - an array of pointers to the recieved requests 403 404 Output: 405 xdata - array of messages to be sent back 406 isz1 - size of each message 407 408 For better efficiency perhaps we should malloc separately each xdata[i], 409 then if a remalloc is required we need only copy the data for that one row 410 rather then all previous rows as it is now where a single large chunck of 411 memory is used. 412 413 */ 414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 415 { 416 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 417 Mat A = c->A,B = c->B; 418 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 419 PetscErrorCode ierr; 420 PetscMPIInt rank; 421 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 422 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 423 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 424 PetscInt *rbuf_i,kmax,rbuf_0; 425 PetscBT xtable; 426 427 PetscFunctionBegin; 428 rank = c->rank; 429 m = C->rmap->N; 430 rstart = C->rmap->rstart; 431 cstart = C->cmap->rstart; 432 ai = a->i; 433 aj = a->j; 434 bi = b->i; 435 bj = b->j; 436 garray = c->garray; 437 438 439 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 440 rbuf_i = rbuf[i]; 441 rbuf_0 = rbuf_i[0]; 442 ct += rbuf_0; 443 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 444 } 445 446 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 447 else max1 = 1; 448 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 449 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 450 ++no_malloc; 451 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 452 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 453 454 ct3 = 0; 455 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 456 rbuf_i = rbuf[i]; 457 rbuf_0 = rbuf_i[0]; 458 ct1 = 2*rbuf_0+1; 459 ct2 = ct1; 460 ct3 += ct1; 461 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 462 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 463 oct2 = ct2; 464 kmax = rbuf_i[2*j]; 465 for (k=0; k<kmax; k++,ct1++) { 466 row = rbuf_i[ct1]; 467 if (!PetscBTLookupSet(xtable,row)) { 468 if (!(ct3 < mem_estimate)) { 469 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 470 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 471 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 472 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 473 xdata[0] = tmp; 474 mem_estimate = new_estimate; ++no_malloc; 475 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 476 } 477 xdata[i][ct2++] = row; 478 ct3++; 479 } 480 } 481 for (k=oct2,max2=ct2; k<max2; k++) { 482 row = xdata[i][k] - rstart; 483 start = ai[row]; 484 end = ai[row+1]; 485 for (l=start; l<end; l++) { 486 val = aj[l] + cstart; 487 if (!PetscBTLookupSet(xtable,val)) { 488 if (!(ct3 < mem_estimate)) { 489 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 490 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 491 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 492 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 493 xdata[0] = tmp; 494 mem_estimate = new_estimate; ++no_malloc; 495 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 496 } 497 xdata[i][ct2++] = val; 498 ct3++; 499 } 500 } 501 start = bi[row]; 502 end = bi[row+1]; 503 for (l=start; l<end; l++) { 504 val = garray[bj[l]]; 505 if (!PetscBTLookupSet(xtable,val)) { 506 if (!(ct3 < 
mem_estimate)) { 507 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 508 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 509 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 510 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 511 xdata[0] = tmp; 512 mem_estimate = new_estimate; ++no_malloc; 513 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 514 } 515 xdata[i][ct2++] = val; 516 ct3++; 517 } 518 } 519 } 520 /* Update the header*/ 521 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 522 xdata[i][2*j-1] = rbuf_i[2*j-1]; 523 } 524 xdata[i][0] = rbuf_0; 525 xdata[i+1] = xdata[i] + ct2; 526 isz1[i] = ct2; /* size of each message */ 527 } 528 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 529 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 530 PetscFunctionReturn(0); 531 } 532 /* -------------------------------------------------------------------------*/ 533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*); 534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 535 /* 536 Every processor gets the entire matrix 537 */ 538 #undef __FUNCT__ 539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 540 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 541 { 542 Mat B; 543 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 544 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 545 PetscErrorCode ierr; 546 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 547 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 548 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 549 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 550 551 PetscFunctionBegin; 552 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 553 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* local row length = diagonal (ad) + off-diagonal (bd) nonzeros */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, directly into B's j array
       at the offset of this process's first row.  Rows are emitted in global
       column order: below-diagonal B-part, A-part, above-diagonal B-part.
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already gathered, reuse the existing B */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location,
       interleaving A- and B-part values in the same order as the indices above
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path: counts/displs were not built above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the gathered copy */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - extracts ismax submatrices given by
   isrow[]/iscol[]; detects the "every process wants the whole matrix" fast
   path and otherwise processes the index sets in memory-bounded stages.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* the fast path is collective, so take it only if everyone wants it */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr =
ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N){
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    /* max_no = 0 stages exist only so all ranks make the same (collective)
       number of calls to _Local */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt
**rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 810 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 811 #if defined (PETSC_USE_CTABLE) 812 PetscTable *cmap,cmap_i,*rmap,rmap_i; 813 #else 814 PetscInt **cmap,*cmap_i,**rmap,*rmap_i; 815 #endif 816 const PetscInt *irow_i; 817 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 818 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 819 MPI_Request *r_waits4,*s_waits3,*s_waits4; 820 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 821 MPI_Status *r_status3,*r_status4,*s_status4; 822 MPI_Comm comm; 823 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 824 PetscMPIInt *onodes1,*olengths1; 825 PetscMPIInt idex,idex2,end; 826 827 PetscFunctionBegin; 828 comm = ((PetscObject)C)->comm; 829 tag0 = ((PetscObject)C)->tag; 830 size = c->size; 831 rank = c->rank; 832 833 /* Get some new tags to keep the communication clean */ 834 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 835 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 836 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 837 838 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 839 840 for (i=0; i<ismax; i++) { 841 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 842 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 843 if (allcolumns[i]){ 844 icol[i] = PETSC_NULL; 845 ncol[i] = C->cmap->N; 846 } else { 847 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 848 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 849 } 850 } 851 852 /* evaluate communication - mesg to who, length of mesg, and buffer space 853 required. 
Based on this, buffers are allocated, and data copied into them*/ 854 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 855 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 856 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 857 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 858 for (i=0; i<ismax; i++) { 859 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 860 jmax = nrow[i]; 861 irow_i = irow[i]; 862 for (j=0; j<jmax; j++) { 863 l = 0; 864 row = irow_i[j]; 865 while (row >= C->rmap->range[l+1]) l++; 866 proc = l; 867 w4[proc]++; 868 } 869 for (j=0; j<size; j++) { 870 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 871 } 872 } 873 874 nrqs = 0; /* no of outgoing messages */ 875 msz = 0; /* total mesg length (for all procs) */ 876 w1[rank] = 0; /* no mesg sent to self */ 877 w3[rank] = 0; 878 for (i=0; i<size; i++) { 879 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 880 } 881 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 882 for (i=0,j=0; i<size; i++) { 883 if (w1[i]) { pa[j] = i; j++; } 884 } 885 886 /* Each message would have a header = 1 + 2*(no of IS) + data */ 887 for (i=0; i<nrqs; i++) { 888 j = pa[i]; 889 w1[j] += w2[j] + 2* w3[j]; 890 msz += w1[j]; 891 } 892 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 893 894 /* Determine the number of messages to expect, their lengths, from from-ids */ 895 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 896 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 897 898 /* Now post the Irecvs corresponding to these messages */ 899 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 900 901 ierr = PetscFree(onodes1);CHKERRQ(ierr); 902 ierr = PetscFree(olengths1);CHKERRQ(ierr); 903 904 /* Allocate Memory for outgoing messages */ 905 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 906 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 907 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 908 909 { 910 PetscInt *iptr = tmp,ict = 0; 911 for (i=0; i<nrqs; i++) { 912 j = pa[i]; 913 iptr += ict; 914 sbuf1[j] = iptr; 915 ict = w1[j]; 916 } 917 } 918 919 /* Form the outgoing messages */ 920 /* Initialize the header space */ 921 for (i=0; i<nrqs; i++) { 922 j = pa[i]; 923 sbuf1[j][0] = 0; 924 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 925 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 926 } 927 928 /* Parse the isrow and copy data into outbuf */ 929 for (i=0; i<ismax; i++) { 930 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 931 irow_i = irow[i]; 932 jmax = nrow[i]; 933 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 934 l = 0; 935 row = irow_i[j]; 936 while (row >= C->rmap->range[l+1]) l++; 937 proc = l; 938 if (proc != rank) { /* copy to the outgoing buf*/ 939 ctr[proc]++; 940 *ptr[proc] = row; 941 ptr[proc]++; 942 } 943 } 944 /* Update the headers for the current IS */ 945 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 946 if ((ctr_j = ctr[j])) { 947 sbuf1_j = sbuf1[j]; 948 k = ++sbuf1_j[0]; 949 sbuf1_j[2*k] = ctr_j; 950 sbuf1_j[2*k-1] = i; 951 } 952 } 953 } 954 955 /* Now post the sends */ 956 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 957 for (i=0; i<nrqs; ++i) { 958 j = pa[i]; 959 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 960 } 961 962 /* Post Receives to capture the buffer size */ 963 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 964 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 965 rbuf2[0] = tmp + msz; 966 for (i=1; i<nrqs; ++i) { 967 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 968 } 969 for (i=0; i<nrqs; ++i) { 970 j = pa[i]; 971 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 972 } 973 974 /* Send to other procs the buf size they should allocate */ 975 976 977 /* Receive messages*/ 978 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 979 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 980 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 981 { 982 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 983 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 984 PetscInt *sbuf2_i; 985 986 for (i=0; i<nrqr; ++i) { 987 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 988 req_size[idex] = 0; 989 rbuf1_i = rbuf1[idex]; 990 start = 2*rbuf1_i[0] + 1; 991 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 992 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 993 sbuf2_i = sbuf2[idex]; 994 for (j=start; j<end; j++) { 995 id = rbuf1_i[j] - rstart; 996 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 997 sbuf2_i[j] = ncols; 998 req_size[idex] += ncols; 999 } 1000 req_source[idex] = r_status1[i].MPI_SOURCE; 1001 /* form the header */ 1002 sbuf2_i[0] = req_size[idex]; 1003 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1004 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1005 } 1006 } 1007 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1008 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1009 1010 /* recv buffer sizes */ 1011 /* Receive messages*/ 1012 1013 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1014 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1015 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1016 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1017 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1018 1019 for (i=0; i<nrqs; ++i) { 1020 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1021 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1022 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1023 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1024 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1025 } 1026 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1027 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1028 1029 /* Wait on sends1 and sends2 */ 1030 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1031 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1032 1033 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1034 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1035 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1036 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1037 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1038 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1039 1040 /* Now allocate buffers for a->j, and send them off */ 1041 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1042 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1043 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1044 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1045 1046 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1047 { 1048 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1049 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1050 PetscInt cend = C->cmap->rend; 1051 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1052 1053 for (i=0; i<nrqr; i++) { 1054 rbuf1_i = rbuf1[i]; 1055 sbuf_aj_i = sbuf_aj[i]; 1056 ct1 = 2*rbuf1_i[0] + 1; 1057 ct2 = 0; 1058 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1059 kmax = rbuf1[i][2*j]; 1060 for (k=0; k<kmax; k++,ct1++) { 1061 row = rbuf1_i[ct1] - rstart; 1062 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1063 ncols = nzA + nzB; 1064 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1065 1066 /* load the column indices for this row into cols*/ 1067 cols = sbuf_aj_i + ct2; 1068 1069 lwrite = 0; 1070 for (l=0; l<nzB; l++) { 1071 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1072 } 1073 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1074 for (l=0; l<nzB; l++) { 1075 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1076 } 1077 1078 ct2 += ncols; 1079 } 1080 } 1081 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1082 } 1083 } 1084 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1085 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1086 1087 /* Allocate buffers for a->a, and send them off */ 1088 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1089 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1090 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1091 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1092 1093 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1094 { 1095 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1096 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1097 PetscInt cend = C->cmap->rend; 1098 PetscInt *b_j = b->j; 1099 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1100 1101 for (i=0; 
i<nrqr; i++) { 1102 rbuf1_i = rbuf1[i]; 1103 sbuf_aa_i = sbuf_aa[i]; 1104 ct1 = 2*rbuf1_i[0]+1; 1105 ct2 = 0; 1106 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1107 kmax = rbuf1_i[2*j]; 1108 for (k=0; k<kmax; k++,ct1++) { 1109 row = rbuf1_i[ct1] - rstart; 1110 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1111 ncols = nzA + nzB; 1112 cworkB = b_j + b_i[row]; 1113 vworkA = a_a + a_i[row]; 1114 vworkB = b_a + b_i[row]; 1115 1116 /* load the column values for this row into vals*/ 1117 vals = sbuf_aa_i+ct2; 1118 1119 lwrite = 0; 1120 for (l=0; l<nzB; l++) { 1121 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1122 } 1123 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1124 for (l=0; l<nzB; l++) { 1125 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1126 } 1127 1128 ct2 += ncols; 1129 } 1130 } 1131 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1132 } 1133 } 1134 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1135 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1136 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1137 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1138 1139 /* Form the matrix */ 1140 /* create col map: global col of C -> local col of submatrices */ 1141 { 1142 const PetscInt *icol_i; 1143 #if defined (PETSC_USE_CTABLE) 1144 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1145 for (i=0; i<ismax; i++) { 1146 if (!allcolumns[i]){ 1147 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1148 jmax = ncol[i]; 1149 icol_i = icol[i]; 1150 cmap_i = cmap[i]; 1151 for (j=0; j<jmax; j++) { 1152 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr); 1153 } 1154 } else { 1155 cmap[i] = PETSC_NULL; 1156 } 1157 } 1158 #else 1159 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1160 for (i=0; i<ismax; i++) { 1161 if (!allcolumns[i]){ 1162 ierr = 
PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1163 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1164 jmax = ncol[i]; 1165 icol_i = icol[i]; 1166 cmap_i = cmap[i]; 1167 for (j=0; j<jmax; j++) { 1168 cmap_i[icol_i[j]] = j+1; 1169 } 1170 } else { 1171 cmap[i] = PETSC_NULL; 1172 } 1173 } 1174 #endif 1175 } 1176 1177 /* Create lens which is required for MatCreate... */ 1178 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1179 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1180 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1181 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1182 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1183 1184 /* Update lens from local data */ 1185 for (i=0; i<ismax; i++) { 1186 jmax = nrow[i]; 1187 if (!allcolumns[i]) cmap_i = cmap[i]; 1188 irow_i = irow[i]; 1189 lens_i = lens[i]; 1190 for (j=0; j<jmax; j++) { 1191 l = 0; 1192 row = irow_i[j]; 1193 while (row >= C->rmap->range[l+1]) l++; 1194 proc = l; 1195 if (proc == rank) { 1196 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1197 if (!allcolumns[i]){ 1198 for (k=0; k<ncols; k++) { 1199 #if defined (PETSC_USE_CTABLE) 1200 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1201 #else 1202 tcol = cmap_i[cols[k]]; 1203 #endif 1204 if (tcol) { lens_i[j]++;} 1205 } 1206 } else { /* allcolumns */ 1207 lens_i[j] = ncols; 1208 } 1209 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1210 } 1211 } 1212 } 1213 1214 /* Create row map: global row of C -> local row of submatrices */ 1215 #if defined (PETSC_USE_CTABLE) 1216 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1217 for (i=0; i<ismax; i++) { 1218 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1219 rmap_i = rmap[i]; 1220 irow_i = irow[i]; 1221 jmax = nrow[i]; 1222 for (j=0; j<jmax; j++) { 1223 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr); 
1224 } 1225 } 1226 #else 1227 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1228 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1230 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1231 for (i=0; i<ismax; i++) { 1232 rmap_i = rmap[i]; 1233 irow_i = irow[i]; 1234 jmax = nrow[i]; 1235 for (j=0; j<jmax; j++) { 1236 rmap_i[irow_i[j]] = j; 1237 } 1238 } 1239 #endif 1240 1241 /* Update lens from offproc data */ 1242 { 1243 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1244 1245 for (tmp2=0; tmp2<nrqs; tmp2++) { 1246 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1247 idex = pa[idex2]; 1248 sbuf1_i = sbuf1[idex]; 1249 jmax = sbuf1_i[0]; 1250 ct1 = 2*jmax+1; 1251 ct2 = 0; 1252 rbuf2_i = rbuf2[idex2]; 1253 rbuf3_i = rbuf3[idex2]; 1254 for (j=1; j<=jmax; j++) { 1255 is_no = sbuf1_i[2*j-1]; 1256 max1 = sbuf1_i[2*j]; 1257 lens_i = lens[is_no]; 1258 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1259 rmap_i = rmap[is_no]; 1260 for (k=0; k<max1; k++,ct1++) { 1261 #if defined (PETSC_USE_CTABLE) 1262 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1263 row--; 1264 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1265 #else 1266 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1267 #endif 1268 max2 = rbuf2_i[ct1]; 1269 for (l=0; l<max2; l++,ct2++) { 1270 if (!allcolumns[is_no]){ 1271 #if defined (PETSC_USE_CTABLE) 1272 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1273 #else 1274 tcol = cmap_i[rbuf3_i[ct2]]; 1275 #endif 1276 if (tcol) { 1277 lens_i[row]++; 1278 } 1279 } else { /* allcolumns */ 1280 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1281 } 1282 } 1283 } 1284 } 1285 } 1286 } 1287 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1288 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1289 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1290 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1291 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1292 1293 /* Create the submatrices */ 1294 if (scall == MAT_REUSE_MATRIX) { 1295 PetscBool flag; 1296 1297 /* 1298 Assumes new rows are same length as the old rows,hence bug! 1299 */ 1300 for (i=0; i<ismax; i++) { 1301 mat = (Mat_SeqAIJ *)(submats[i]->data); 1302 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1303 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1304 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1305 /* Initial matrix as if empty */ 1306 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1307 submats[i]->factortype = C->factortype; 1308 } 1309 } else { 1310 for (i=0; i<ismax; i++) { 1311 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1312 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1313 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1314 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1315 } 1316 } 1317 1318 /* Assemble the matrices */ 1319 /* First assemble the local rows */ 1320 { 1321 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1322 PetscScalar *imat_a; 1323 1324 for (i=0; i<ismax; i++) { 1325 mat = (Mat_SeqAIJ*)submats[i]->data; 1326 imat_ilen = mat->ilen; 1327 imat_j = mat->j; 1328 imat_i = mat->i; 1329 imat_a = mat->a; 1330 if (!allcolumns[i]) cmap_i = cmap[i]; 1331 rmap_i = rmap[i]; 1332 irow_i = irow[i]; 1333 jmax = nrow[i]; 1334 for (j=0; j<jmax; j++) { 1335 l = 0; 1336 row = 
irow_i[j]; 1337 while (row >= C->rmap->range[l+1]) l++; 1338 proc = l; 1339 if (proc == rank) { 1340 old_row = row; 1341 #if defined (PETSC_USE_CTABLE) 1342 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1343 row--; 1344 #else 1345 row = rmap_i[row]; 1346 #endif 1347 ilen_row = imat_ilen[row]; 1348 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1349 mat_i = imat_i[row] ; 1350 mat_a = imat_a + mat_i; 1351 mat_j = imat_j + mat_i; 1352 if (!allcolumns[i]){ 1353 for (k=0; k<ncols; k++) { 1354 #if defined (PETSC_USE_CTABLE) 1355 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1356 #else 1357 tcol = cmap_i[cols[k]]; 1358 #endif 1359 if (tcol){ 1360 *mat_j++ = tcol - 1; 1361 *mat_a++ = vals[k]; 1362 ilen_row++; 1363 } 1364 } 1365 } else { /* allcolumns */ 1366 for (k=0; k<ncols; k++) { 1367 *mat_j++ = cols[k] ; /* global col index! */ 1368 *mat_a++ = vals[k]; 1369 ilen_row++; 1370 } 1371 } 1372 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1373 imat_ilen[row] = ilen_row; 1374 } 1375 } 1376 } 1377 } 1378 1379 /* Now assemble the off proc rows*/ 1380 { 1381 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1382 PetscInt *imat_j,*imat_i; 1383 PetscScalar *imat_a,*rbuf4_i; 1384 1385 for (tmp2=0; tmp2<nrqs; tmp2++) { 1386 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1387 idex = pa[idex2]; 1388 sbuf1_i = sbuf1[idex]; 1389 jmax = sbuf1_i[0]; 1390 ct1 = 2*jmax + 1; 1391 ct2 = 0; 1392 rbuf2_i = rbuf2[idex2]; 1393 rbuf3_i = rbuf3[idex2]; 1394 rbuf4_i = rbuf4[idex2]; 1395 for (j=1; j<=jmax; j++) { 1396 is_no = sbuf1_i[2*j-1]; 1397 rmap_i = rmap[is_no]; 1398 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1399 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1400 imat_ilen = mat->ilen; 1401 imat_j = mat->j; 1402 imat_i = mat->i; 1403 imat_a = mat->a; 1404 max1 = sbuf1_i[2*j]; 1405 for (k=0; k<max1; k++,ct1++) { 1406 row = sbuf1_i[ct1]; 1407 #if defined (PETSC_USE_CTABLE) 1408 ierr = 
PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1409 row--; 1410 #else 1411 row = rmap_i[row]; 1412 #endif 1413 ilen = imat_ilen[row]; 1414 mat_i = imat_i[row] ; 1415 mat_a = imat_a + mat_i; 1416 mat_j = imat_j + mat_i; 1417 max2 = rbuf2_i[ct1]; 1418 if (!allcolumns[is_no]){ 1419 for (l=0; l<max2; l++,ct2++) { 1420 1421 #if defined (PETSC_USE_CTABLE) 1422 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1423 #else 1424 tcol = cmap_i[rbuf3_i[ct2]]; 1425 #endif 1426 if (tcol) { 1427 *mat_j++ = tcol - 1; 1428 *mat_a++ = rbuf4_i[ct2]; 1429 ilen++; 1430 } 1431 } 1432 } else { /* allcolumns */ 1433 for (l=0; l<max2; l++,ct2++) { 1434 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1435 *mat_a++ = rbuf4_i[ct2]; 1436 ilen++; 1437 } 1438 } 1439 imat_ilen[row] = ilen; 1440 } 1441 } 1442 } 1443 } 1444 1445 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1446 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1447 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1448 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1449 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1450 1451 /* Restore the indices */ 1452 for (i=0; i<ismax; i++) { 1453 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1454 if (!allcolumns[i]){ 1455 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1456 } 1457 } 1458 1459 /* Destroy allocated memory */ 1460 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1461 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1462 ierr = PetscFree(pa);CHKERRQ(ierr); 1463 1464 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1465 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1466 for (i=0; i<nrqr; ++i) { 1467 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1468 } 1469 for (i=0; i<nrqs; ++i) { 1470 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1471 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1472 } 1473 1474 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1475 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1476 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 
  /* Final cleanup for the extraction above: release the send buffers for
     column indices (sbuf_aj) and numerical values (sbuf_aa); each was
     allocated as one contiguous chunk anchored at slot 0. */
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

  /* Destroy the row maps (global row of C -> local row of each submatrix) */
#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  /* Destroy the column maps; none was created for "all columns" index sets */
  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]){
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  /* lens[] was allocated as one contiguous chunk anchored at lens[0] */
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* All values were deposited directly into the submatrix storage above,
     so assembly here only finalizes the matrices */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Observe that the Seq matrices used to construct this MPI matrix are not increfed.
   Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   MatCreateMPIAIJFromSeqMatrices_Private - Builds an MPIAIJ matrix C on comm
   from two existing SeqAIJ matrices: A (the local diagonal block) and B (the
   local off-diagonal block).  The Seq matrices are installed into C directly,
   without copying and WITHOUT incrementing their reference counts, so
   ownership effectively transfers to C: the caller must not destroy A or B.

   Input:
     comm - communicator for the new parallel matrix
     A    - local diagonal SeqAIJ block
     B    - local off-diagonal SeqAIJ block
   Output:
     C    - new MPIAIJ matrix; marked preallocated/assembled only if both
            components are
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ *aij;
  PetscBool seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  }
  ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  }
  /* A and B must agree on local row count and on block sizes */
  if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatCreate(comm, C); CHKERRQ(ierr);
  /* Local sizes come from the diagonal block; global sizes are derived by the layout */
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The off-diagonal block is expected to carry exactly the
     (*C)->cmap->N - A->cmap->n columns not covered by the diagonal block */
  if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
  aij = (Mat_MPIAIJ*)((*C)->data);
  /* Install the Seq blocks directly (no copy, no incref; see header comment) */
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
  /* C is only as ready as both of its components are */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
   MatMPIAIJExtractSeqMatrices_Private - Returns borrowed references to the
   diagonal (A) and off-diagonal (B) SeqAIJ blocks of the MPIAIJ matrix C.
   Inverse of MatCreateMPIAIJFromSeqMatrices_Private.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful!
   */
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   MatGetSubMatricesParallel_MPIXAIJ - Generic driver that extracts (possibly
   parallel) submatrices of C, one per (isrow[i], iscol[i]) pair.  The
   matrix-format-specific work is delegated to three callbacks:

     getsubmats_seq - extracts sequential submatrices (used for the diag and
                      off-diag pieces, and called directly when every IS is
                      sequential);
     makefromseq    - glues a diag/off-diag pair of Seq matrices into one
                      parallel matrix on the communicator of the IS;
     extractseq     - inverse of makefromseq: recovers the Seq pieces already
                      living inside an existing parallel submatrix on reuse.

   Each isrow[i]/iscol[i] pair must share a communicator; an IS whose
   communicator has size 1 yields a sequential submatrix, all others yield
   parallel ones.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt size, flag;
  PetscInt i,ii;
  PetscInt ismax_c;    /* number of "parallel" (comm size > 1) index sets */
  PetscFunctionBegin;
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate the row/column communicator pairing and count the parallel ISs */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else { /* if(ismax_c) */
    Mat *A,*B;
    IS *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
      array of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are needed in any event: even if the array of parallel matrices already exists,
      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
      getsubmats_seq will deposit the extracted diag and off-diag parts.
      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        }
        else {
          /* a sequential submatrix serves as its own diag block */
          A[i] = (*submat)[i];
        }
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
        isrow_c[ii] = isrow[i];   /* aliased, not increfed: only iscol_c[ii] is destroyed below */
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Only the placeholder arrays are freed; the Seq matrices themselves now
       live inside (or are) the returned submatrices */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   MatGetSubMatricesParallel_MPIAIJ - MPIAIJ instantiation of the generic
   driver above, wired to the AIJ-specific extraction/assembly callbacks.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
                                           MatCreateMPIAIJFromSeqMatrices_Private,
                                           MatMPIAIJExtractSeqMatrices_Private);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}