/*
   Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - increases the overlap of the imax index sets
   in is[] by ov levels; each level of overlap is one call to
   MatIncreaseOverlap_MPIAIJ_Once (a collective operation).
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - performs one round of overlap increase:
   each rank sends the indices it does not own to the owning ranks (message
   format above), processes the requests it receives, and merges the replies
   back into is[].  The input index sets are destroyed and replaced by the
   enlarged ones.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who,length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  /* w1[p] = total no of indices destined for rank p; w2[p] = 1 iff a message
     goes to p; w3[p] = no of index sets contributing to the message for p;
     w4 is the per-IS scratch version of w1 */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* Carve the single buffer tmp into one contiguous slice per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /*plug in the headers*/
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work*/
  /* table[i] is a bit table over the M global rows marking membership in set i;
     data[i] is the growing list of rows in set i (capacity M each) */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices*/
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the enlarged sets are recreated at the end from data[]/isz[] */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work - overlap communication with local set expansion */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages*/
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now  post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors*/
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message*/
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Recreate the enlarged index sets from the accumulated rows */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* only walk the rows known before this call (max); rows appended by the
       inner loops are handled in the next overlap round */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;  /* global row -> local row of the diagonal block */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;     /* local col -> global col */
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];      /* compressed off-diagonal col -> global col */
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
  MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
  and return the output

  Input:
    C    - the matrix
    nrqr - no of messages being processed.
    rbuf - an array of pointers to the received requests

  Output:
    xdata - array of messages to be sent back
    isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather then all previous rows as it is now where a single large chunck of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* total_sz = total no of requested indices; ct = total no of IS headers */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* estimate reply size from the average nonzeros per local row; grown
     on demand below if the estimate is exceeded */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 counts total words written into xdata[0]; the reallocation blocks
     below grow the single backing buffer by 1.5x and re-base every
     xdata[ctr] pointer into the new buffer */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each requested row by the columns of its A (diagonal) and
         B (off-diagonal) parts, in global numbering */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) {
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) {
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - gathers the whole parallel matrix A onto
   every process as a sequential matrix *Bin[0] (structure on
   MAT_INITIAL_MATRIX, values when flag == MAT_GET_VALUES).
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    /* lens[i] = nnz of global row i = nnz(diagonal block) + nnz(off-diagonal block) */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    /* build this rank's rows directly in place inside b->j, merging the
       off-diagonal (bd, via garray) and diagonal (ad) columns so each row
       comes out in ascending global column order */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    /* values are packed in exactly the same lower/diagonal/upper order as
       the column indices above, so they line up with b->j */
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    /* recvcounts/displs were freed only on the MAT_INITIAL_MATRIX path?
       NOTE(review): on MAT_REUSE_MATRIX they are still the initialized-to-0
       locals, hence this reallocation */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - extracts the ismax submatrices
   C(isrow[i],iscol[i]) as sequential matrices in *submat, staging the
   extraction to bound memory use.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* MPI_MIN over the 0/1 flags: take the fast path only if ALL ranks want it */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr =
PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N){
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  /* nmax = no of submatrices per stage such that the per-submatrix column
     workspace (C->cmap->N PetscInts each) stays around 20,000,000 bytes */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  /* ranks with fewer local stages still participate with max_no = 0 so the
     collective calls inside _Local stay matched */
  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt
**sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 807 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 808 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 809 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 810 #if defined (PETSC_USE_CTABLE) 811 PetscTable *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i; 812 #else 813 PetscInt **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i; 814 #endif 815 const PetscInt *irow_i; 816 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 817 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 818 MPI_Request *r_waits4,*s_waits3,*s_waits4; 819 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 820 MPI_Status *r_status3,*r_status4,*s_status4; 821 MPI_Comm comm; 822 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 823 PetscMPIInt *onodes1,*olengths1; 824 PetscMPIInt idex,idex2,end; 825 826 PetscFunctionBegin; 827 comm = ((PetscObject)C)->comm; 828 tag0 = ((PetscObject)C)->tag; 829 size = c->size; 830 rank = c->rank; 831 832 /* Get some new tags to keep the communication clean */ 833 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 834 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 835 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 836 837 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 838 839 for (i=0; i<ismax; i++) { 840 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 841 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 842 if (allcolumns[i]){ 843 icol[i] = PETSC_NULL; 844 ncol[i] = C->cmap->N; 845 } else { 846 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 847 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 848 } 849 } 850 851 /* evaluate communication - mesg to who, length of mesg, and buffer space 852 required. 
Based on this, buffers are allocated, and data copied into them*/ 853 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 854 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 855 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 856 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 857 for (i=0; i<ismax; i++) { 858 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 859 jmax = nrow[i]; 860 irow_i = irow[i]; 861 for (j=0; j<jmax; j++) { 862 l = 0; 863 row = irow_i[j]; 864 while (row >= C->rmap->range[l+1]) l++; 865 proc = l; 866 w4[proc]++; 867 } 868 for (j=0; j<size; j++) { 869 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 870 } 871 } 872 873 nrqs = 0; /* no of outgoing messages */ 874 msz = 0; /* total mesg length (for all procs) */ 875 w1[rank] = 0; /* no mesg sent to self */ 876 w3[rank] = 0; 877 for (i=0; i<size; i++) { 878 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 879 } 880 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 881 for (i=0,j=0; i<size; i++) { 882 if (w1[i]) { pa[j] = i; j++; } 883 } 884 885 /* Each message would have a header = 1 + 2*(no of IS) + data */ 886 for (i=0; i<nrqs; i++) { 887 j = pa[i]; 888 w1[j] += w2[j] + 2* w3[j]; 889 msz += w1[j]; 890 } 891 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 892 893 /* Determine the number of messages to expect, their lengths, from from-ids */ 894 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 895 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 896 897 /* Now post the Irecvs corresponding to these messages */ 898 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 899 900 ierr = PetscFree(onodes1);CHKERRQ(ierr); 901 ierr = PetscFree(olengths1);CHKERRQ(ierr); 902 903 /* Allocate Memory for outgoing messages */ 904 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 905 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 906 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 907 908 { 909 PetscInt *iptr = tmp,ict = 0; 910 for (i=0; i<nrqs; i++) { 911 j = pa[i]; 912 iptr += ict; 913 sbuf1[j] = iptr; 914 ict = w1[j]; 915 } 916 } 917 918 /* Form the outgoing messages */ 919 /* Initialize the header space */ 920 for (i=0; i<nrqs; i++) { 921 j = pa[i]; 922 sbuf1[j][0] = 0; 923 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 924 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 925 } 926 927 /* Parse the isrow and copy data into outbuf */ 928 for (i=0; i<ismax; i++) { 929 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 930 irow_i = irow[i]; 931 jmax = nrow[i]; 932 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 933 l = 0; 934 row = irow_i[j]; 935 while (row >= C->rmap->range[l+1]) l++; 936 proc = l; 937 if (proc != rank) { /* copy to the outgoing buf*/ 938 ctr[proc]++; 939 *ptr[proc] = row; 940 ptr[proc]++; 941 } 942 } 943 /* Update the headers for the current IS */ 944 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 945 if ((ctr_j = ctr[j])) { 946 sbuf1_j = sbuf1[j]; 947 k = ++sbuf1_j[0]; 948 sbuf1_j[2*k] = ctr_j; 949 sbuf1_j[2*k-1] = i; 950 } 951 } 952 } 953 954 /* Now post the sends */ 955 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 956 for (i=0; i<nrqs; ++i) { 957 j = pa[i]; 958 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 959 } 960 961 /* Post Receives to capture the buffer size */ 962 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 963 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 964 rbuf2[0] = tmp + msz; 965 for (i=1; i<nrqs; ++i) { 966 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 967 } 968 for (i=0; i<nrqs; ++i) { 969 j = pa[i]; 970 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 971 } 972 973 /* Send to other procs the buf size they should allocate */ 974 975 976 /* Receive messages*/ 977 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 978 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 979 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 980 { 981 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 982 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 983 PetscInt *sbuf2_i; 984 985 for (i=0; i<nrqr; ++i) { 986 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 987 req_size[idex] = 0; 988 rbuf1_i = rbuf1[idex]; 989 start = 2*rbuf1_i[0] + 1; 990 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 991 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 992 sbuf2_i = sbuf2[idex]; 993 for (j=start; j<end; j++) { 994 id = rbuf1_i[j] - rstart; 995 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 996 sbuf2_i[j] = ncols; 997 req_size[idex] += ncols; 998 } 999 req_source[idex] = r_status1[i].MPI_SOURCE; 1000 /* form the header */ 1001 sbuf2_i[0] = req_size[idex]; 1002 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1003 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1004 } 1005 } 1006 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1007 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1008 1009 /* recv buffer sizes */ 1010 /* Receive messages*/ 1011 1012 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1013 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1014 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1015 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1016 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1017 1018 for (i=0; i<nrqs; ++i) { 1019 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1020 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1021 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1022 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1023 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1024 } 1025 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1026 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1027 1028 /* Wait on sends1 and sends2 */ 1029 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1030 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1031 1032 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1033 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1034 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1035 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1036 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1037 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1038 1039 /* Now allocate buffers for a->j, and send them off */ 1040 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1041 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1042 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1043 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1044 1045 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1046 { 1047 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1048 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1049 PetscInt cend = C->cmap->rend; 1050 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1051 1052 for (i=0; i<nrqr; i++) { 1053 rbuf1_i = rbuf1[i]; 1054 sbuf_aj_i = sbuf_aj[i]; 1055 ct1 = 2*rbuf1_i[0] + 1; 1056 ct2 = 0; 1057 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1058 kmax = rbuf1[i][2*j]; 1059 for (k=0; k<kmax; k++,ct1++) { 1060 row = rbuf1_i[ct1] - rstart; 1061 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1062 ncols = nzA + nzB; 1063 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1064 1065 /* load the column indices for this row into cols*/ 1066 cols = sbuf_aj_i + ct2; 1067 1068 lwrite = 0; 1069 for (l=0; l<nzB; l++) { 1070 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1071 } 1072 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1073 for (l=0; l<nzB; l++) { 1074 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1075 } 1076 1077 ct2 += ncols; 1078 } 1079 } 1080 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1081 } 1082 } 1083 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1084 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1085 1086 /* Allocate buffers for a->a, and send them off */ 1087 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1088 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1089 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1090 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1091 1092 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1093 { 1094 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1095 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1096 PetscInt cend = C->cmap->rend; 1097 PetscInt *b_j = b->j; 1098 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1099 1100 for (i=0; 
i<nrqr; i++) { 1101 rbuf1_i = rbuf1[i]; 1102 sbuf_aa_i = sbuf_aa[i]; 1103 ct1 = 2*rbuf1_i[0]+1; 1104 ct2 = 0; 1105 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1106 kmax = rbuf1_i[2*j]; 1107 for (k=0; k<kmax; k++,ct1++) { 1108 row = rbuf1_i[ct1] - rstart; 1109 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1110 ncols = nzA + nzB; 1111 cworkB = b_j + b_i[row]; 1112 vworkA = a_a + a_i[row]; 1113 vworkB = b_a + b_i[row]; 1114 1115 /* load the column values for this row into vals*/ 1116 vals = sbuf_aa_i+ct2; 1117 1118 lwrite = 0; 1119 for (l=0; l<nzB; l++) { 1120 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1121 } 1122 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1123 for (l=0; l<nzB; l++) { 1124 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1125 } 1126 1127 ct2 += ncols; 1128 } 1129 } 1130 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1131 } 1132 } 1133 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1134 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1135 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1136 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1137 1138 /* Form the matrix */ 1139 /* create col map: global col of C -> local col of submatrices */ 1140 { 1141 const PetscInt *icol_i; 1142 #if defined (PETSC_USE_CTABLE) 1143 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1144 for (i=0; i<ismax; i++) { 1145 if (!allcolumns[i]){ 1146 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1147 jmax = ncol[i]; 1148 icol_i = icol[i]; 1149 cmap_i = cmap[i]; 1150 for (j=0; j<jmax; j++) { 1151 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1152 } 1153 } else { 1154 cmap[i] = PETSC_NULL; 1155 } 1156 } 1157 #else 1158 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1159 for (i=0; i<ismax; i++) { 1160 if (!allcolumns[i]){ 1161 ierr = 
PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1162 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1163 jmax = ncol[i]; 1164 icol_i = icol[i]; 1165 cmap_i = cmap[i]; 1166 for (j=0; j<jmax; j++) { 1167 cmap_i[icol_i[j]] = j+1; 1168 } 1169 } else { 1170 cmap[i] = PETSC_NULL; 1171 } 1172 } 1173 #endif 1174 } 1175 1176 /* Create lens which is required for MatCreate... */ 1177 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1178 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1179 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1180 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1181 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1182 1183 /* Update lens from local data */ 1184 for (i=0; i<ismax; i++) { 1185 jmax = nrow[i]; 1186 if (!allcolumns[i]) cmap_i = cmap[i]; 1187 irow_i = irow[i]; 1188 lens_i = lens[i]; 1189 for (j=0; j<jmax; j++) { 1190 l = 0; 1191 row = irow_i[j]; 1192 while (row >= C->rmap->range[l+1]) l++; 1193 proc = l; 1194 if (proc == rank) { 1195 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1196 if (!allcolumns[i]){ 1197 for (k=0; k<ncols; k++) { 1198 #if defined (PETSC_USE_CTABLE) 1199 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1200 #else 1201 tcol = cmap_i[cols[k]]; 1202 #endif 1203 if (tcol) { lens_i[j]++;} 1204 } 1205 } else { /* allcolumns */ 1206 lens_i[j] = ncols; 1207 } 1208 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1209 } 1210 } 1211 } 1212 1213 /* Create row map: global row of C -> local row of submatrices */ 1214 #if defined (PETSC_USE_CTABLE) 1215 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1216 for (i=0; i<ismax; i++) { 1217 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1218 rmap_i = rmap[i]; 1219 irow_i = irow[i]; 1220 jmax = nrow[i]; 1221 for (j=0; j<jmax; j++) { 1222 ierr = 
PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1223 } 1224 } 1225 #else 1226 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1227 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1228 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1229 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1230 for (i=0; i<ismax; i++) { 1231 rmap_i = rmap[i]; 1232 irow_i = irow[i]; 1233 jmax = nrow[i]; 1234 for (j=0; j<jmax; j++) { 1235 rmap_i[irow_i[j]] = j; 1236 } 1237 } 1238 #endif 1239 1240 /* Update lens from offproc data */ 1241 { 1242 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1243 1244 for (tmp2=0; tmp2<nrqs; tmp2++) { 1245 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1246 idex = pa[idex2]; 1247 sbuf1_i = sbuf1[idex]; 1248 jmax = sbuf1_i[0]; 1249 ct1 = 2*jmax+1; 1250 ct2 = 0; 1251 rbuf2_i = rbuf2[idex2]; 1252 rbuf3_i = rbuf3[idex2]; 1253 for (j=1; j<=jmax; j++) { 1254 is_no = sbuf1_i[2*j-1]; 1255 max1 = sbuf1_i[2*j]; 1256 lens_i = lens[is_no]; 1257 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1258 rmap_i = rmap[is_no]; 1259 for (k=0; k<max1; k++,ct1++) { 1260 #if defined (PETSC_USE_CTABLE) 1261 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1262 row--; 1263 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1264 #else 1265 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1266 #endif 1267 max2 = rbuf2_i[ct1]; 1268 for (l=0; l<max2; l++,ct2++) { 1269 if (!allcolumns[is_no]){ 1270 #if defined (PETSC_USE_CTABLE) 1271 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1272 #else 1273 tcol = cmap_i[rbuf3_i[ct2]]; 1274 #endif 1275 if (tcol) { 1276 lens_i[row]++; 1277 } 1278 } else { /* allcolumns */ 1279 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1280 } 1281 } 1282 } 1283 } 1284 } 1285 } 1286 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1287 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1288 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1289 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1290 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1291 1292 /* Create the submatrices */ 1293 if (scall == MAT_REUSE_MATRIX) { 1294 PetscBool flag; 1295 1296 /* 1297 Assumes new rows are same length as the old rows,hence bug! 1298 */ 1299 for (i=0; i<ismax; i++) { 1300 mat = (Mat_SeqAIJ *)(submats[i]->data); 1301 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1302 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1303 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1304 /* Initial matrix as if empty */ 1305 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1306 submats[i]->factortype = C->factortype; 1307 } 1308 } else { 1309 for (i=0; i<ismax; i++) { 1310 PetscInt rbs,cbs; 1311 ierr = ISGetBlockSize(isrow[i],&rbs); CHKERRQ(ierr); 1312 ierr = ISGetBlockSize(iscol[i],&cbs); CHKERRQ(ierr); 1313 1314 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1315 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1316 1317 ierr = MatSetBlockSizes(submats[i],rbs,cbs); CHKERRQ(ierr); 1318 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1319 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1320 } 1321 } 1322 1323 /* Assemble the matrices */ 1324 /* First assemble the local rows */ 1325 { 1326 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1327 PetscScalar *imat_a; 1328 1329 for (i=0; i<ismax; i++) { 1330 mat = (Mat_SeqAIJ*)submats[i]->data; 1331 imat_ilen = mat->ilen; 1332 imat_j = mat->j; 
1333 imat_i = mat->i; 1334 imat_a = mat->a; 1335 if (!allcolumns[i]) cmap_i = cmap[i]; 1336 rmap_i = rmap[i]; 1337 irow_i = irow[i]; 1338 jmax = nrow[i]; 1339 for (j=0; j<jmax; j++) { 1340 l = 0; 1341 row = irow_i[j]; 1342 while (row >= C->rmap->range[l+1]) l++; 1343 proc = l; 1344 if (proc == rank) { 1345 old_row = row; 1346 #if defined (PETSC_USE_CTABLE) 1347 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1348 row--; 1349 #else 1350 row = rmap_i[row]; 1351 #endif 1352 ilen_row = imat_ilen[row]; 1353 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1354 mat_i = imat_i[row] ; 1355 mat_a = imat_a + mat_i; 1356 mat_j = imat_j + mat_i; 1357 if (!allcolumns[i]){ 1358 for (k=0; k<ncols; k++) { 1359 #if defined (PETSC_USE_CTABLE) 1360 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1361 #else 1362 tcol = cmap_i[cols[k]]; 1363 #endif 1364 if (tcol){ 1365 *mat_j++ = tcol - 1; 1366 *mat_a++ = vals[k]; 1367 ilen_row++; 1368 } 1369 } 1370 } else { /* allcolumns */ 1371 for (k=0; k<ncols; k++) { 1372 *mat_j++ = cols[k] ; /* global col index! 
*/ 1373 *mat_a++ = vals[k]; 1374 ilen_row++; 1375 } 1376 } 1377 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1378 imat_ilen[row] = ilen_row; 1379 } 1380 } 1381 } 1382 } 1383 1384 /* Now assemble the off proc rows*/ 1385 { 1386 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1387 PetscInt *imat_j,*imat_i; 1388 PetscScalar *imat_a,*rbuf4_i; 1389 1390 for (tmp2=0; tmp2<nrqs; tmp2++) { 1391 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1392 idex = pa[idex2]; 1393 sbuf1_i = sbuf1[idex]; 1394 jmax = sbuf1_i[0]; 1395 ct1 = 2*jmax + 1; 1396 ct2 = 0; 1397 rbuf2_i = rbuf2[idex2]; 1398 rbuf3_i = rbuf3[idex2]; 1399 rbuf4_i = rbuf4[idex2]; 1400 for (j=1; j<=jmax; j++) { 1401 is_no = sbuf1_i[2*j-1]; 1402 rmap_i = rmap[is_no]; 1403 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1404 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1405 imat_ilen = mat->ilen; 1406 imat_j = mat->j; 1407 imat_i = mat->i; 1408 imat_a = mat->a; 1409 max1 = sbuf1_i[2*j]; 1410 for (k=0; k<max1; k++,ct1++) { 1411 row = sbuf1_i[ct1]; 1412 #if defined (PETSC_USE_CTABLE) 1413 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1414 row--; 1415 #else 1416 row = rmap_i[row]; 1417 #endif 1418 ilen = imat_ilen[row]; 1419 mat_i = imat_i[row] ; 1420 mat_a = imat_a + mat_i; 1421 mat_j = imat_j + mat_i; 1422 max2 = rbuf2_i[ct1]; 1423 if (!allcolumns[is_no]){ 1424 for (l=0; l<max2; l++,ct2++) { 1425 1426 #if defined (PETSC_USE_CTABLE) 1427 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1428 #else 1429 tcol = cmap_i[rbuf3_i[ct2]]; 1430 #endif 1431 if (tcol) { 1432 *mat_j++ = tcol - 1; 1433 *mat_a++ = rbuf4_i[ct2]; 1434 ilen++; 1435 } 1436 } 1437 } else { /* allcolumns */ 1438 for (l=0; l<max2; l++,ct2++) { 1439 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1440 *mat_a++ = rbuf4_i[ct2]; 1441 ilen++; 1442 } 1443 } 1444 imat_ilen[row] = ilen; 1445 } 1446 } 1447 } 1448 } 1449 1450 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1451 ierr = 
PetscFree(r_waits4);CHKERRQ(ierr); 1452 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1453 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1454 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1455 1456 /* Restore the indices */ 1457 for (i=0; i<ismax; i++) { 1458 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1459 if (!allcolumns[i]){ 1460 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1461 } 1462 } 1463 1464 /* Destroy allocated memory */ 1465 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1466 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1467 ierr = PetscFree(pa);CHKERRQ(ierr); 1468 1469 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1470 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1471 for (i=0; i<nrqr; ++i) { 1472 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1473 } 1474 for (i=0; i<nrqs; ++i) { 1475 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1476 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1477 } 1478 1479 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1480 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1481 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1482 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1483 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1484 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1485 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1486 1487 #if defined (PETSC_USE_CTABLE) 1488 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1489 #else 1490 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1491 #endif 1492 ierr = PetscFree(rmap);CHKERRQ(ierr); 1493 1494 for (i=0; i<ismax; i++) { 1495 if (!allcolumns[i]){ 1496 #if defined (PETSC_USE_CTABLE) 1497 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1498 #else 1499 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1500 #endif 1501 } 1502 } 1503 ierr = PetscFree(cmap);CHKERRQ(ierr); 1504 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1505 ierr = PetscFree(lens);CHKERRQ(ierr); 1506 1507 for (i=0; i<ismax; i++) { 1508 ierr = 
MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Observe that the Seq matrices used to construct this MPI matrix are not increfed
    (their reference counts are not increased). Be careful not to destroy them elsewhere,
    or the MPI matrix will be left holding dangling pointers.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   Builds an MPIAIJ matrix C on communicator comm directly from its two sequential
   building blocks: A (the diagonal block) and B (the off-diagonal block).

   Both A and B must be MATSEQAIJ, have the same number of local rows, and have
   matching row/column block sizes; otherwise an error is raised. The Seq matrices
   are installed into C without increasing their reference counts (see the note
   above), so the caller must not destroy them independently.

   C inherits its preallocated/assembled state from the conjunction of A's and B's
   states. Errors are reported as PETSC_ERR_PLIB because the routine is private.
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  /* Create the parallel shell; local sizes come from the diagonal block A. */
  ierr = MatCreate(comm, C); CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs); CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The global column count must equal diag columns plus off-diag columns. */
  if((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
  /* Plug the Seq pieces straight into the MPIAIJ data structure (no incref!). */
  aij = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
   Returns, in A and B, the sequential diagonal and off-diagonal blocks underlying
   the MPIAIJ matrix C. The caller receives borrowed references (no incref), so the
   returned matrices must not be destroyed by the caller.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   Generic driver that extracts submat[i] = C(isrow[i], iscol[i]) for each i, where
   the index sets may live on communicators of size > 1 (yielding parallel
   submatrices).

   The matrix-type-specific work is delegated to three callbacks:
     getsubmats_seq - extracts sequential submatrices for given index sets;
     makefromseq    - builds a parallel matrix from its Seq diag/off-diag pieces;
     extractseq     - returns the Seq diag/off-diag pieces of a parallel matrix.

   Each isrow[i]/iscol[i] pair must share a communicator (checked with
   MPI_Comm_compare). For index sets on size-1 communicators the result is simply
   the sequential submatrix. For parallel index sets the diag part is extracted
   with iscol[i] and the off-diag part with its complement (via ISGetNonlocalIS);
   the two pieces are then combined by makefromseq.

   With scall == MAT_REUSE_MATRIX the existing matrices in submat[] are refilled in
   place (their Seq pieces are recovered with extractseq and used as destinations);
   otherwise submat[] is allocated here and populated with newly created matrices.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;   /* number of index sets living on communicators of size > 1 */

  PetscFunctionBegin;
  if (!ismax) {
    PetscFunctionReturn(0);
  }
  /* Count the parallel index sets; also insist each isrow[i]/iscol[i] pair shares a communicator. */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  } else { /* if(ismax_c) */
    Mat *A,*B;
    IS  *isrow_c, *iscol_c;
    PetscMPIInt size;   /* NOTE(review): shadows the function-scope 'size' declared above */
    /*
       Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
       array of sequential matrices underlying the resulting parallel matrices.
       Which arrays to allocate is based on the value of MatReuse scall.
       There are as many diag matrices as there are original index sets.
       There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

       Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
       we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
       getsubmats_seq will deposit the extracted diag and off-diag parts.
       However, if reuse is taking place, we have to allocate the seq matrix arrays here.
       If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        } else {
          A[i] = (*submat)[i];
        }
      }
    }
    /*
       Construct the complements of the iscol ISs for parallel ISs only.
       These are used to extract the off-diag portion of the resulting parallel matrix.
       The row IS for the off-diag portion is the same as for the diag portion,
       so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
        isrow_c[ii] = isrow[i];   /* alias, not a copy: only iscol_c entries are destroyed below */
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
       If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
       have been extracted directly into the parallel matrices containing them, or
       simply into the sequential matrix identical with the corresponding A (if size == 1).
       Otherwise, make sure that parallel matrices are constructed from A & B, or the
       A is put into the correct submat slot (if size == 1).
    */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
             For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        } else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Only the placeholder arrays are freed here; the matrices themselves now live in submat[]. */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   MPIAIJ instantiation of MatGetSubMatricesParallel_MPIXAIJ: supplies the AIJ-specific
   sequential extraction, parallel-assembly, and Seq-block-extraction callbacks.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}