/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

/* NOTE(review): the forward declaration below takes char** where the definition
   takes PetscBT* — presumably PetscBT typedefs to char*; verify against petscbt.h */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Enlarges each of the imax index sets is[] by ov
   levels of overlap, by applying the single-level routine ov times.

   Input Parameters:
     C    - the parallel (MPIAIJ) matrix
     imax - number of index sets
     is   - the index sets; replaced in place by the enlarged sets
     ov   - number of overlap levels; must be non-negative
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs one level of overlap increase.

   Phases (all use the message format documented above):
     1. Count, per destination rank, how many indices of each IS are owned
        remotely; from the counts derive message sizes and the list pa[] of
        ranks actually communicated with.
     2. Exchange message counts/lengths, post receives (tag1), pack and send
        the requests; indices owned locally are entered directly into the
        per-IS bit table/data arrays.
     3. Do the local adjacency expansion (MatIncreaseOverlap_MPIAIJ_Local).
     4. Process received requests (MatIncreaseOverlap_MPIAIJ_Receive) and
        send the answers back (tag2); merge answers into the local tables.
     5. Rebuild each is[i] from the accumulated data.

   The input index sets is[] are destroyed and re-created enlarged.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  /* w1[p]: total indices destined for rank p; w2[p]: 1 if any message goes to p;
     w3[p]: number of distinct ISs contributing to p; w4: per-IS scratch counts */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, and from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* Carve the single buffer tmp into one contiguous slice per destination rank */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work */
  /* table[i] is a bit table over all M global rows; data[i] collects the
     (deduplicated) rows belonging to IS i; both are slices of single mallocs */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* The input ISs are destroyed here; they are re-created (enlarged) at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      /* the i-th Irecv was posted for onodes1[i]; the statuses must agree */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, and from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message; merge every returned row into the IS it belongs to */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Re-create the (now enlarged) index sets from the accumulated data */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* Note: only the first isz[i] (locally owned) entries are expanded; entries
       appended inside this loop are column neighbours, not re-expanded here */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;    /* global row -> local row of the diagonal block */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local columns, shift back to global */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: garray maps compressed columns to global */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
402 rbuf - an array of pointers to the recieved requests 403 404 Output: 405 xdata - array of messages to be sent back 406 isz1 - size of each message 407 408 For better efficiency perhaps we should malloc separately each xdata[i], 409 then if a remalloc is required we need only copy the data for that one row 410 rather then all previous rows as it is now where a single large chunck of 411 memory is used. 412 413 */ 414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 415 { 416 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 417 Mat A = c->A,B = c->B; 418 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 419 PetscErrorCode ierr; 420 PetscMPIInt rank; 421 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 422 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 423 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 424 PetscInt *rbuf_i,kmax,rbuf_0; 425 PetscBT xtable; 426 427 PetscFunctionBegin; 428 rank = c->rank; 429 m = C->rmap->N; 430 rstart = C->rmap->rstart; 431 cstart = C->cmap->rstart; 432 ai = a->i; 433 aj = a->j; 434 bi = b->i; 435 bj = b->j; 436 garray = c->garray; 437 438 439 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 440 rbuf_i = rbuf[i]; 441 rbuf_0 = rbuf_i[0]; 442 ct += rbuf_0; 443 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 444 } 445 446 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 447 else max1 = 1; 448 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 449 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 450 ++no_malloc; 451 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 452 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 453 454 ct3 = 0; 455 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 456 rbuf_i = rbuf[i]; 457 rbuf_0 = rbuf_i[0]; 458 ct1 = 2*rbuf_0+1; 459 ct2 = ct1; 460 ct3 += ct1; 461 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 462 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 463 oct2 = ct2; 464 kmax = rbuf_i[2*j]; 465 for (k=0; k<kmax; k++,ct1++) { 466 row = rbuf_i[ct1]; 467 if (!PetscBTLookupSet(xtable,row)) { 468 if (!(ct3 < mem_estimate)) { 469 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 470 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 471 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 472 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 473 xdata[0] = tmp; 474 mem_estimate = new_estimate; ++no_malloc; 475 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 476 } 477 xdata[i][ct2++] = row; 478 ct3++; 479 } 480 } 481 for (k=oct2,max2=ct2; k<max2; k++) { 482 row = xdata[i][k] - rstart; 483 start = ai[row]; 484 end = ai[row+1]; 485 for (l=start; l<end; l++) { 486 val = aj[l] + cstart; 487 if (!PetscBTLookupSet(xtable,val)) { 488 if (!(ct3 < mem_estimate)) { 489 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 490 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 491 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 492 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 493 xdata[0] = tmp; 494 mem_estimate = new_estimate; ++no_malloc; 495 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 496 } 497 xdata[i][ct2++] = val; 498 ct3++; 499 } 500 } 501 start = bi[row]; 502 end = bi[row+1]; 503 for (l=start; l<end; l++) { 504 val = garray[bj[l]]; 505 if (!PetscBTLookupSet(xtable,val)) { 506 if (!(ct3 < 
mem_estimate)) { 507 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 508 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 509 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 510 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 511 xdata[0] = tmp; 512 mem_estimate = new_estimate; ++no_malloc; 513 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 514 } 515 xdata[i][ct2++] = val; 516 ct3++; 517 } 518 } 519 } 520 /* Update the header*/ 521 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 522 xdata[i][2*j-1] = rbuf_i[2*j-1]; 523 } 524 xdata[i][0] = rbuf_0; 525 xdata[i+1] = xdata[i] + ct2; 526 isz1[i] = ct2; /* size of each message */ 527 } 528 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 529 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 530 PetscFunctionReturn(0); 531 } 532 /* -------------------------------------------------------------------------*/ 533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool,Mat*); 534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 535 /* 536 Every processor gets the entire matrix 537 */ 538 #undef __FUNCT__ 539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 540 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 541 { 542 Mat B; 543 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 544 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 545 PetscErrorCode ierr; 546 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 547 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 548 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 549 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 550 551 PetscFunctionBegin; 552 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 553 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 554 555 if (scall == MAT_INITIAL_MATRIX) { 556 /* ---------------------------------------------------------------- 557 Tell every processor the number of nonzeros per row 558 */ 559 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 560 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 561 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 562 } 563 sendcount = A->rmap->rend - A->rmap->rstart; 564 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 565 for (i=0; i<size; i++) { 566 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 567 displs[i] = A->rmap->range[i]; 568 } 569 #if defined(PETSC_HAVE_MPI_IN_PLACE) 570 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 571 #else 572 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 573 #endif 574 /* --------------------------------------------------------------- 575 Create the sequential matrix of the same type as the local block diagonal 576 */ 577 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 578 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 579 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 580 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 581 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 582 **Bin = B; 583 b = (Mat_SeqAIJ *)B->data; 584 585 /*-------------------------------------------------------------------- 586 Copy my part of matrix column indices over 587 */ 588 sendcount = ad->nz + bd->nz; 589 jsendbuf = b->j + b->i[rstarts[rank]]; 590 a_jsendbuf = ad->j; 591 b_jsendbuf = bd->j; 592 n = A->rmap->rend - A->rmap->rstart; 593 cnt = 0; 594 for (i=0; i<n; i++) { 595 596 /* put in lower 
diagonal portion */ 597 m = bd->i[i+1] - bd->i[i]; 598 while (m > 0) { 599 /* is it above diagonal (in bd (compressed) numbering) */ 600 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 601 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 602 m--; 603 } 604 605 /* put in diagonal portion */ 606 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 607 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 608 } 609 610 /* put in upper diagonal portion */ 611 while (m-- > 0) { 612 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 613 } 614 } 615 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 616 617 /*-------------------------------------------------------------------- 618 Gather all column indices to all processors 619 */ 620 for (i=0; i<size; i++) { 621 recvcounts[i] = 0; 622 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 623 recvcounts[i] += lens[j]; 624 } 625 } 626 displs[0] = 0; 627 for (i=1; i<size; i++) { 628 displs[i] = displs[i-1] + recvcounts[i-1]; 629 } 630 #if defined(PETSC_HAVE_MPI_IN_PLACE) 631 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 632 #else 633 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 634 #endif 635 /*-------------------------------------------------------------------- 636 Assemble the matrix into useable form (note numerical values not yet set) 637 */ 638 /* set the b->ilen (length of each row) values */ 639 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 640 /* set the b->i indices */ 641 b->i[0] = 0; 642 for (i=1; i<=A->rmap->N; i++) { 643 b->i[i] = b->i[i-1] + lens[i-1]; 644 } 645 ierr = PetscFree(lens);CHKERRQ(ierr); 646 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 647 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 648 649 } else { 650 B = **Bin; 651 b = (Mat_SeqAIJ 
*)B->data; 652 } 653 654 /*-------------------------------------------------------------------- 655 Copy my part of matrix numerical values into the values location 656 */ 657 if (flag == MAT_GET_VALUES){ 658 sendcount = ad->nz + bd->nz; 659 sendbuf = b->a + b->i[rstarts[rank]]; 660 a_sendbuf = ad->a; 661 b_sendbuf = bd->a; 662 b_sendj = bd->j; 663 n = A->rmap->rend - A->rmap->rstart; 664 cnt = 0; 665 for (i=0; i<n; i++) { 666 667 /* put in lower diagonal portion */ 668 m = bd->i[i+1] - bd->i[i]; 669 while (m > 0) { 670 /* is it above diagonal (in bd (compressed) numbering) */ 671 if (garray[*b_sendj] > A->rmap->rstart + i) break; 672 sendbuf[cnt++] = *b_sendbuf++; 673 m--; 674 b_sendj++; 675 } 676 677 /* put in diagonal portion */ 678 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 679 sendbuf[cnt++] = *a_sendbuf++; 680 } 681 682 /* put in upper diagonal portion */ 683 while (m-- > 0) { 684 sendbuf[cnt++] = *b_sendbuf++; 685 b_sendj++; 686 } 687 } 688 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 689 690 /* ----------------------------------------------------------------- 691 Gather all numerical values to all processors 692 */ 693 if (!recvcounts) { 694 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 695 } 696 for (i=0; i<size; i++) { 697 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 698 } 699 displs[0] = 0; 700 for (i=1; i<size; i++) { 701 displs[i] = displs[i-1] + recvcounts[i-1]; 702 } 703 recvbuf = b->a; 704 #if defined(PETSC_HAVE_MPI_IN_PLACE) 705 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 706 #else 707 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 708 #endif 709 } /* endof (flag == MAT_GET_VALUES) */ 710 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 711 712 if 
(A->symmetric){ 713 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 714 } else if (A->hermitian) { 715 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 716 } else if (A->structurally_symmetric) { 717 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 718 } 719 PetscFunctionReturn(0); 720 } 721 722 723 724 #undef __FUNCT__ 725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 727 { 728 PetscErrorCode ierr; 729 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 730 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,wantallmatrixcolumns=PETSC_FALSE; 731 732 PetscFunctionBegin; 733 /* 734 Check for special case: each processor gets entire matrix 735 */ 736 if (ismax == 1 && C->rmap->N == C->cmap->N) { 737 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 738 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 739 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 740 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 741 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 742 wantallmatrix = PETSC_TRUE; 743 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 744 } 745 } 746 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 747 if (twantallmatrix) { 748 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 749 PetscFunctionReturn(0); 750 } 751 752 /* Allocate memory to hold all the submatrices */ 753 if (scall != MAT_REUSE_MATRIX) { 754 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 755 } 756 757 /* Check for special case: each processor gets entire matrix columns */ 758 if (ismax == 1){ 759 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 760 ierr = 
ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 761 if (colflag && ncol == C->cmap->N){ 762 wantallmatrixcolumns = PETSC_TRUE; 763 ierr = MatGetSubMatrices_MPIAIJ_Local(C,ismax,isrow,iscol,scall,wantallmatrixcolumns,*submat);CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 } 767 768 /* Determine the number of stages through which submatrices are done */ 769 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 770 /* 771 Each stage will extract nmax submatrices. 772 nmax is determined by the matrix column dimension. 773 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 774 */ 775 if (!nmax) nmax = 1; 776 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 777 778 /* Make sure every processor loops through the nstages */ 779 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 780 781 for (i=0,pos=0; i<nstages; i++) { 782 if (pos+nmax <= ismax) max_no = nmax; 783 else if (pos == ismax) max_no = 0; 784 else max_no = ismax-pos; 785 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,wantallmatrixcolumns,*submat+pos);CHKERRQ(ierr); 786 pos += max_no; 787 } 788 PetscFunctionReturn(0); 789 } 790 791 /* -------------------------------------------------------------------------*/ 792 #undef __FUNCT__ 793 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 794 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 795 { 796 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 797 Mat A = c->A; 798 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 799 const PetscInt **icol,**irow; 800 PetscInt *nrow,*ncol,start; 801 PetscErrorCode ierr; 802 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 803 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 804 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 805 PetscInt 
**rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 806 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 807 #if defined (PETSC_USE_CTABLE) 808 PetscTable *cmap,cmap_i,*rmap,rmap_i; 809 #else 810 PetscInt **cmap,*cmap_i,**rmap,*rmap_i; 811 #endif 812 const PetscInt *irow_i; 813 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 814 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 815 MPI_Request *r_waits4,*s_waits3,*s_waits4; 816 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 817 MPI_Status *r_status3,*r_status4,*s_status4; 818 MPI_Comm comm; 819 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 820 PetscMPIInt *onodes1,*olengths1; 821 PetscMPIInt idex,idex2,end; 822 823 PetscFunctionBegin; 824 comm = ((PetscObject)C)->comm; 825 tag0 = ((PetscObject)C)->tag; 826 size = c->size; 827 rank = c->rank; 828 //printf("[%d] MatGetSubMatrices_MPIAIJ_Local() allcolumns %d\n",rank,allcolumns); 829 //allcolumns = PETSC_FALSE; 830 831 /* Get some new tags to keep the communication clean */ 832 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 833 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 834 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 835 836 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 837 838 for (i=0; i<ismax; i++) { 839 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 840 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 841 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 842 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 843 } 844 845 /* evaluate communication - mesg to who, length of mesg, and buffer space 846 required. 
Based on this, buffers are allocated, and data copied into them*/ 847 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 848 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 849 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 850 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 851 for (i=0; i<ismax; i++) { 852 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 853 jmax = nrow[i]; 854 irow_i = irow[i]; 855 for (j=0; j<jmax; j++) { 856 l = 0; 857 row = irow_i[j]; 858 while (row >= C->rmap->range[l+1]) l++; 859 proc = l; 860 w4[proc]++; 861 } 862 for (j=0; j<size; j++) { 863 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 864 } 865 } 866 867 nrqs = 0; /* no of outgoing messages */ 868 msz = 0; /* total mesg length (for all procs) */ 869 w1[rank] = 0; /* no mesg sent to self */ 870 w3[rank] = 0; 871 for (i=0; i<size; i++) { 872 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 873 } 874 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 875 for (i=0,j=0; i<size; i++) { 876 if (w1[i]) { pa[j] = i; j++; } 877 } 878 879 /* Each message would have a header = 1 + 2*(no of IS) + data */ 880 for (i=0; i<nrqs; i++) { 881 j = pa[i]; 882 w1[j] += w2[j] + 2* w3[j]; 883 msz += w1[j]; 884 } 885 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 886 887 /* Determine the number of messages to expect, their lengths, from from-ids */ 888 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 889 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 890 891 /* Now post the Irecvs corresponding to these messages */ 892 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 893 894 ierr = PetscFree(onodes1);CHKERRQ(ierr); 895 ierr = PetscFree(olengths1);CHKERRQ(ierr); 896 897 /* Allocate Memory for outgoing messages */ 898 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 899 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 900 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 901 902 { 903 PetscInt *iptr = tmp,ict = 0; 904 for (i=0; i<nrqs; i++) { 905 j = pa[i]; 906 iptr += ict; 907 sbuf1[j] = iptr; 908 ict = w1[j]; 909 } 910 } 911 912 /* Form the outgoing messages */ 913 /* Initialize the header space */ 914 for (i=0; i<nrqs; i++) { 915 j = pa[i]; 916 sbuf1[j][0] = 0; 917 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 918 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 919 } 920 921 /* Parse the isrow and copy data into outbuf */ 922 for (i=0; i<ismax; i++) { 923 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 924 irow_i = irow[i]; 925 jmax = nrow[i]; 926 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 927 l = 0; 928 row = irow_i[j]; 929 while (row >= C->rmap->range[l+1]) l++; 930 proc = l; 931 if (proc != rank) { /* copy to the outgoing buf*/ 932 ctr[proc]++; 933 *ptr[proc] = row; 934 ptr[proc]++; 935 } 936 } 937 /* Update the headers for the current IS */ 938 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 939 if ((ctr_j = ctr[j])) { 940 sbuf1_j = sbuf1[j]; 941 k = ++sbuf1_j[0]; 942 sbuf1_j[2*k] = ctr_j; 943 sbuf1_j[2*k-1] = i; 944 } 945 } 946 } 947 948 /* Now post the sends */ 949 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 950 for (i=0; i<nrqs; ++i) { 951 j = pa[i]; 952 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 953 } 954 955 /* Post Receives to capture the buffer size */ 956 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 957 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 958 rbuf2[0] = tmp + msz; 959 for (i=1; i<nrqs; ++i) { 960 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 961 } 962 for (i=0; i<nrqs; ++i) { 963 j = pa[i]; 964 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 965 } 966 967 /* Send to other procs the buf size they should allocate */ 968 969 970 /* Receive messages*/ 971 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 972 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 973 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 974 { 975 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 976 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 977 PetscInt *sbuf2_i; 978 979 for (i=0; i<nrqr; ++i) { 980 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 981 req_size[idex] = 0; 982 rbuf1_i = rbuf1[idex]; 983 start = 2*rbuf1_i[0] + 1; 984 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 985 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 986 sbuf2_i = sbuf2[idex]; 987 for (j=start; j<end; j++) { 988 id = rbuf1_i[j] - rstart; 989 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 990 sbuf2_i[j] = ncols; 991 req_size[idex] += ncols; 992 } 993 req_source[idex] = r_status1[i].MPI_SOURCE; 994 /* form the header */ 995 sbuf2_i[0] = req_size[idex]; 996 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 997 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 998 } 999 } 1000 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1001 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1002 1003 /* recv buffer sizes */ 1004 /* Receive messages*/ 1005 1006 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1007 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1008 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1009 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1010 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1011 1012 for (i=0; i<nrqs; ++i) { 1013 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1014 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1015 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1016 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1017 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1018 } 1019 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1020 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1021 1022 /* Wait on sends1 and sends2 */ 1023 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1024 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1025 1026 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1027 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1028 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1029 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1030 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1031 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1032 1033 /* Now allocate buffers for a->j, and send them off */ 1034 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1035 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1036 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1037 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1038 1039 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1040 { 1041 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1042 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1043 PetscInt cend = C->cmap->rend; 1044 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1045 1046 for (i=0; i<nrqr; i++) { 1047 rbuf1_i = rbuf1[i]; 1048 sbuf_aj_i = sbuf_aj[i]; 1049 ct1 = 2*rbuf1_i[0] + 1; 1050 ct2 = 0; 1051 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1052 kmax = rbuf1[i][2*j]; 1053 for (k=0; k<kmax; k++,ct1++) { 1054 row = rbuf1_i[ct1] - rstart; 1055 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1056 ncols = nzA + nzB; 1057 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1058 1059 /* load the column indices for this row into cols*/ 1060 cols = sbuf_aj_i + ct2; 1061 1062 lwrite = 0; 1063 for (l=0; l<nzB; l++) { 1064 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1065 } 1066 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1067 for (l=0; l<nzB; l++) { 1068 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1069 } 1070 1071 ct2 += ncols; 1072 } 1073 } 1074 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1075 } 1076 } 1077 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1078 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1079 1080 /* Allocate buffers for a->a, and send them off */ 1081 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1082 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1083 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1084 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1085 1086 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1087 { 1088 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1089 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1090 PetscInt cend = C->cmap->rend; 1091 PetscInt *b_j = b->j; 1092 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1093 1094 for (i=0; 
i<nrqr; i++) { 1095 rbuf1_i = rbuf1[i]; 1096 sbuf_aa_i = sbuf_aa[i]; 1097 ct1 = 2*rbuf1_i[0]+1; 1098 ct2 = 0; 1099 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1100 kmax = rbuf1_i[2*j]; 1101 for (k=0; k<kmax; k++,ct1++) { 1102 row = rbuf1_i[ct1] - rstart; 1103 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1104 ncols = nzA + nzB; 1105 cworkB = b_j + b_i[row]; 1106 vworkA = a_a + a_i[row]; 1107 vworkB = b_a + b_i[row]; 1108 1109 /* load the column values for this row into vals*/ 1110 vals = sbuf_aa_i+ct2; 1111 1112 lwrite = 0; 1113 for (l=0; l<nzB; l++) { 1114 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1115 } 1116 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1117 for (l=0; l<nzB; l++) { 1118 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1119 } 1120 1121 ct2 += ncols; 1122 } 1123 } 1124 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1125 } 1126 } 1127 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1128 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1129 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1130 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1131 1132 /* Form the matrix */ 1133 /* create col map: global col of C -> local col of submatrices */ 1134 if (!allcolumns) 1135 { 1136 const PetscInt *icol_i; 1137 #if defined (PETSC_USE_CTABLE) 1138 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1139 for (i=0; i<ismax; i++) { 1140 ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr); 1141 jmax = ncol[i]; 1142 icol_i = icol[i]; 1143 cmap_i = cmap[i]; 1144 for (j=0; j<jmax; j++) { 1145 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr); 1146 } 1147 } 1148 #else 1149 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1150 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1151 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1152 
for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1153 for (i=0; i<ismax; i++) { 1154 jmax = ncol[i]; 1155 icol_i = icol[i]; 1156 cmap_i = cmap[i]; 1157 for (j=0; j<jmax; j++) { 1158 cmap_i[icol_i[j]] = j+1; 1159 //printf("[%d] cmap_i[%d]=%d\n",rank,icol_i[j],j+1); 1160 } 1161 } 1162 #endif 1163 } 1164 1165 /* Create lens which is required for MatCreate... */ 1166 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1167 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1168 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1169 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1170 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1171 1172 /* Update lens from local data */ 1173 for (i=0; i<ismax; i++) { 1174 jmax = nrow[i]; 1175 if (!allcolumns) cmap_i = cmap[i]; 1176 irow_i = irow[i]; 1177 lens_i = lens[i]; 1178 for (j=0; j<jmax; j++) { // for each row 1179 l = 0; 1180 row = irow_i[j]; 1181 while (row >= C->rmap->range[l+1]) l++; 1182 proc = l; 1183 if (proc == rank) { 1184 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1185 if (!allcolumns){ 1186 for (k=0; k<ncols; k++) { 1187 #if defined (PETSC_USE_CTABLE) 1188 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1189 #else 1190 tcol = cmap_i[cols[k]]; 1191 //printf("[%d] cmap_i[%d]=%d\n",rank,cols[k],tcol); 1192 #endif 1193 if (tcol) { lens_i[j]++;} // 1194 } 1195 } else { // allcolumns 1196 lens_i[j] = ncols; 1197 } 1198 //if (rank==0) printf("\n"); 1199 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1200 } //if (proc == rank) { -> lens_i[j]=ncols; 1201 } //for (j=0; j<jmax; j++) 1202 } // for (i=0; i<ismax; i++) 1203 1204 /* Create row map: global row of C -> local row of submatrices */ 1205 #if defined (PETSC_USE_CTABLE) 1206 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1207 for (i=0; i<ismax; i++) { 1208 ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr); 1209 rmap_i = rmap[i]; 
1210 irow_i = irow[i]; 1211 jmax = nrow[i]; 1212 for (j=0; j<jmax; j++) { 1213 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr); 1214 } 1215 } 1216 #else 1217 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1218 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1219 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1220 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1221 for (i=0; i<ismax; i++) { 1222 rmap_i = rmap[i]; 1223 irow_i = irow[i]; 1224 jmax = nrow[i]; 1225 for (j=0; j<jmax; j++) { 1226 rmap_i[irow_i[j]] = j; 1227 } 1228 } 1229 #endif 1230 1231 /* Update lens from offproc data */ 1232 { 1233 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1234 1235 for (tmp2=0; tmp2<nrqs; tmp2++) { 1236 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1237 idex = pa[idex2]; 1238 sbuf1_i = sbuf1[idex]; 1239 jmax = sbuf1_i[0]; 1240 ct1 = 2*jmax+1; 1241 ct2 = 0; 1242 rbuf2_i = rbuf2[idex2]; 1243 rbuf3_i = rbuf3[idex2]; 1244 for (j=1; j<=jmax; j++) { 1245 is_no = sbuf1_i[2*j-1]; 1246 max1 = sbuf1_i[2*j]; 1247 lens_i = lens[is_no]; 1248 if (!allcolumns){cmap_i = cmap[is_no];} 1249 rmap_i = rmap[is_no]; 1250 for (k=0; k<max1; k++,ct1++) { 1251 #if defined (PETSC_USE_CTABLE) 1252 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1253 row--; 1254 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1255 #else 1256 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1257 #endif 1258 max2 = rbuf2_i[ct1]; 1259 for (l=0; l<max2; l++,ct2++) { 1260 if (!allcolumns){ 1261 #if defined (PETSC_USE_CTABLE) 1262 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1263 #else 1264 tcol = cmap_i[rbuf3_i[ct2]]; 1265 #endif 1266 //printf("[%d] tcol %d\n",rank,tcol); 1267 if (tcol) { 1268 lens_i[row]++; 1269 } 1270 } else { //allcolumns 1271 lens_i[row]++; //lens_i[row] += max2 ? 
1272 } 1273 } //for (l=0; l<max2; l++,ct2++) { 1274 } 1275 } 1276 } 1277 } 1278 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1279 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1280 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1281 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1282 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1283 1284 /* Create the submatrices */ 1285 if (scall == MAT_REUSE_MATRIX) { 1286 PetscBool flag; 1287 1288 /* 1289 Assumes new rows are same length as the old rows,hence bug! 1290 */ 1291 for (i=0; i<ismax; i++) { 1292 mat = (Mat_SeqAIJ *)(submats[i]->data); 1293 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1294 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1295 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1296 /* Initial matrix as if empty */ 1297 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1298 submats[i]->factortype = C->factortype; 1299 } 1300 } else { 1301 for (i=0; i<ismax; i++) { 1302 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1303 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1304 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1305 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1306 } 1307 } 1308 1309 /* Assemble the matrices */ 1310 /* First assemble the local rows */ 1311 { 1312 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1313 PetscScalar *imat_a; 1314 1315 for (i=0; i<ismax; i++) { 1316 mat = (Mat_SeqAIJ*)submats[i]->data; 1317 imat_ilen = mat->ilen; 1318 imat_j = mat->j; 1319 imat_i = mat->i; 1320 imat_a = mat->a; 1321 if (!allcolumns) cmap_i = cmap[i]; 1322 rmap_i = rmap[i]; 1323 irow_i = irow[i]; 1324 jmax = nrow[i]; 1325 for (j=0; j<jmax; j++) { 
1326 l = 0; 1327 row = irow_i[j]; 1328 while (row >= C->rmap->range[l+1]) l++; 1329 proc = l; 1330 if (proc == rank) { 1331 old_row = row; 1332 #if defined (PETSC_USE_CTABLE) 1333 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1334 row--; 1335 #else 1336 row = rmap_i[row]; 1337 #endif 1338 ilen_row = imat_ilen[row]; 1339 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1340 mat_i = imat_i[row] ; 1341 mat_a = imat_a + mat_i; 1342 mat_j = imat_j + mat_i; 1343 if (!allcolumns){ 1344 for (k=0; k<ncols; k++) { 1345 #if defined (PETSC_USE_CTABLE) 1346 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1347 #else 1348 tcol = cmap_i[cols[k]]; 1349 #endif 1350 //printf("[%d] cols %d, tcol-1 %d\n",rank,cols[k],tcol-1); 1351 if (tcol){ 1352 *mat_j++ = tcol - 1; 1353 *mat_a++ = vals[k]; 1354 ilen_row++; 1355 } 1356 } 1357 } else { //allcolumns 1358 for (k=0; k<ncols; k++) { 1359 *mat_j++ = cols[k] ; /* global col index! */ 1360 *mat_a++ = vals[k]; 1361 ilen_row++; 1362 } 1363 } 1364 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1365 imat_ilen[row] = ilen_row; 1366 } 1367 } 1368 } 1369 } 1370 1371 /* Now assemble the off proc rows*/ 1372 { 1373 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1374 PetscInt *imat_j,*imat_i; 1375 PetscScalar *imat_a,*rbuf4_i; 1376 1377 for (tmp2=0; tmp2<nrqs; tmp2++) { 1378 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1379 idex = pa[idex2]; 1380 sbuf1_i = sbuf1[idex]; 1381 jmax = sbuf1_i[0]; 1382 ct1 = 2*jmax + 1; 1383 ct2 = 0; 1384 rbuf2_i = rbuf2[idex2]; 1385 rbuf3_i = rbuf3[idex2]; 1386 rbuf4_i = rbuf4[idex2]; 1387 for (j=1; j<=jmax; j++) { 1388 is_no = sbuf1_i[2*j-1]; 1389 rmap_i = rmap[is_no]; 1390 if (!allcolumns){cmap_i = cmap[is_no];} 1391 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1392 imat_ilen = mat->ilen; 1393 imat_j = mat->j; 1394 imat_i = mat->i; 1395 imat_a = mat->a; 1396 max1 = sbuf1_i[2*j]; 1397 for (k=0; k<max1; k++,ct1++) { 1398 row = 
sbuf1_i[ct1]; 1399 #if defined (PETSC_USE_CTABLE) 1400 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1401 row--; 1402 #else 1403 row = rmap_i[row]; 1404 #endif 1405 ilen = imat_ilen[row]; 1406 mat_i = imat_i[row] ; 1407 mat_a = imat_a + mat_i; 1408 mat_j = imat_j + mat_i; 1409 max2 = rbuf2_i[ct1]; 1410 if (!allcolumns){ 1411 for (l=0; l<max2; l++,ct2++) { 1412 1413 #if defined (PETSC_USE_CTABLE) 1414 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1415 #else 1416 tcol = cmap_i[rbuf3_i[ct2]]; 1417 #endif 1418 if (tcol) { 1419 //if (rank) printf("[%d] rbuf3_i[ct2] %d, tcol %d\n",rank,rbuf3_i[ct2],tcol-1); 1420 *mat_j++ = tcol - 1; 1421 *mat_a++ = rbuf4_i[ct2]; 1422 ilen++; 1423 } 1424 } 1425 } else { //allcolumns 1426 for (l=0; l<max2; l++,ct2++) { 1427 *mat_j++ = rbuf3_i[ct2]; /* global col index! */ 1428 *mat_a++ = rbuf4_i[ct2]; 1429 ilen++; 1430 } 1431 } 1432 imat_ilen[row] = ilen; 1433 } 1434 } 1435 } 1436 } 1437 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1438 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1439 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1440 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1441 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1442 1443 /* Restore the indices */ 1444 for (i=0; i<ismax; i++) { 1445 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1446 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1447 } 1448 1449 /* Destroy allocated memory */ 1450 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1451 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1452 ierr = PetscFree(pa);CHKERRQ(ierr); 1453 1454 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1455 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1456 for (i=0; i<nrqr; ++i) { 1457 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1458 } 1459 for (i=0; i<nrqs; ++i) { 1460 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1461 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1462 } 1463 1464 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 
  /* ----------------------------------------------------------------
     Cleanup phase of the submatrix extraction above: free the receive
     buffers, the contiguous send buffers, the row/column maps and the
     row-length arrays, then assemble the freshly filled submatrices.
     ---------------------------------------------------------------- */
  ierr = PetscFree(rbuf3);CHKERRQ(ierr);
  ierr = PetscFree(rbuf4);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); /* sbuf_aj[] points into one contiguous allocation anchored at sbuf_aj[0] */
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); /* likewise for the scalar send buffer */
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {
    /* the column map cmap[] exists only when a column subset was requested (!allcolumns) */
    if (!allcolumns){ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);}
    ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);
  }
#else
  if (!allcolumns){ierr = PetscFree(cmap[0]);CHKERRQ(ierr);}
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr);
#endif
  if (!allcolumns){ierr = PetscFree(cmap);CHKERRQ(ierr);}
  ierr = PetscFree(rmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr);
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Finalize every extracted sequential submatrix */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Builds an MPIAIJ matrix *C on comm out of a sequential diagonal block A
   and a sequential off-diagonal block B.

   Observe that the Seq matrices used to construct this MPI matrix are not increfed:
   *C simply takes over the caller's references.  Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  }
  ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  }
  /* A and B must have the same local row count and matching block sizes */
  if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatCreate(comm, C); CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* B must supply exactly the columns of *C that are not covered by the diagonal block A */
  if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
  aij = (Mat_MPIAIJ*)((*C)->data);
  /* Plug the caller's sequential matrices in directly -- no reference is taken (see header comment) */
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
  /* *C is preallocated/assembled exactly when both of its pieces are */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/* Returns borrowed pointers to the diagonal (A) and off-diagonal (B) sequential blocks of the MPIAIJ matrix C. */
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful!
 */
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   Extracts submatrices of C indexed by (isrow[i], iscol[i]); unlike the plain
   MatGetSubMatrices(), index sets living on a communicator of size > 1 produce
   parallel submatrices.  The AIJ-specific work is delegated to three callbacks:
     getsubmats_seq - extracts sequential submatrices (diag and off-diag pieces),
     makefromseq    - glues a diag/off-diag pair into one parallel matrix,
     extractseq     - recovers the diag/off-diag pieces from an existing parallel matrix
                      (used for MAT_REUSE_MATRIX).
   On MAT_INITIAL_MATRIX the submat array itself is allocated here; ownership of the
   intermediate seq pieces is transferred to the parallel matrices by makefromseq.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;   /* number of index sets that are truly parallel (comm size > 1) */
  PetscFunctionBegin;
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate the IS pairs and count the parallel ones */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else { /* if(ismax_c) */
    Mat *A,*B;
    IS *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
       Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
       array of sequential matrices underlying the resulting parallel matrices.
       Which arrays to allocate is based on the value of MatReuse scall.
       There are as many diag matrices as there are original index sets.
       There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

       Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
       we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
       getsubmats_seq will deposit the extracted diag and off-diag parts.
       However, if reuse is taking place, we have to allocate the seq matrix arrays here.
       If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;  /* ii walks only the parallel (off-diag) slots */
        }
        else {
          A[i] = (*submat)[i];
        }
      }
    }
    /*
       Construct the complements of the iscol ISs for parallel ISs only.
       These are used to extract the off-diag portion of the resulting parallel matrix.
       The row IS for the off-diag portion is the same as for the diag portion,
       so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
        isrow_c[ii] = isrow[i];  /* aliased, not duplicated -- do not destroy below */
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
       If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
       have been extracted directly into the parallel matrices containing them, or
       simply into the sequential matrix identical with the corresponding A (if size == 1).
       Otherwise, make sure that parallel matrices are constructed from A & B, or the
       A is put into the correct submat slot (if size == 1).
    */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
             For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Only the placeholder arrays are freed; the matrices themselves now live in submat[] */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/* AIJ front end: binds the generic parallel-submatrix driver above to the MPIAIJ-specific callbacks. */
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
                                           MatCreateMPIAIJFromSeqMatrices_Private,
                                           MatMPIAIJExtractSeqMatrices_Private);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}