1 2 /* 3 Routines to compute overlapping regions of a parallel MPI matrix 4 and to find submatrices that were shared across processors. 5 */ 6 #include <../src/mat/impls/aij/mpi/mpiaij.h> 7 #include <petscbt.h> 8 9 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *); 10 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**); 11 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*); 12 extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 13 extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**); 14 15 #undef __FUNCT__ 16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ" 17 PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov) 18 { 19 PetscErrorCode ierr; 20 PetscInt i; 21 22 PetscFunctionBegin; 23 if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified"); 24 for (i=0; i<ov; ++i) { 25 ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr); 26 } 27 PetscFunctionReturn(0); 28 } 29 30 /* 31 Sample message format: 32 If a processor A wants processor B to process some elements corresponding 33 to index sets is[1],is[5] 34 mesg [0] = 2 (no of index sets in the mesg) 35 ----------- 36 mesg [1] = 1 => is[1] 37 mesg [2] = sizeof(is[1]); 38 ----------- 39 mesg [3] = 5 => is[5] 40 mesg [4] = sizeof(is[5]); 41 ----------- 42 mesg [5] 43 mesg [n] datas[1] 44 ----------- 45 mesg[n+1] 46 mesg[m] data(is[5]) 47 ----------- 48 49 Notes: 50 nrqs - no of requests sent (or to be sent out) 51 nrqr - no of requests recieved (which have to be or which have been processed 52 */ 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once" 55 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[]) 56 { 57 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 58 PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2; 59 const PetscInt **idx,*idx_i; 60 PetscInt *n,**data,len; 61 PetscErrorCode ierr; 62 PetscMPIInt size,rank,tag1,tag2; 63 PetscInt M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr; 64 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p; 65 PetscBT *table; 66 MPI_Comm comm; 67 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2; 68 MPI_Status *s_status,*recv_status; 69 char *t_p; 70 71 PetscFunctionBegin; 72 comm = ((PetscObject)C)->comm; 73 size = c->size; 74 rank = c->rank; 75 M = C->rmap->N; 76 77 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 78 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 79 80 ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr); 81 82 for (i=0; i<imax; i++) { 83 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr); 84 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr); 85 } 86 87 /* evaluate communication - mesg to who,length of mesg, and buffer space 88 required. Based on this, buffers are allocated, and data copied into them*/ 89 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); 90 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 91 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 92 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 93 for (i=0; i<imax; i++) { 94 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/ 95 idx_i = idx[i]; 96 len = n[i]; 97 for (j=0; j<len; j++) { 98 row = idx_i[j]; 99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries"); 100 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 101 w4[proc]++; 102 } 103 for (j=0; j<size; j++){ 104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 105 } 106 } 107 108 nrqs = 0; /* no of outgoing messages */ 109 msz = 0; /* total mesg length (for all proc */ 110 w1[rank] = 0; /* no mesg sent to intself */ 111 w3[rank] = 0; 112 for (i=0; i<size; i++) { 113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 114 } 115 /* pa - is list of processors to communicate with */ 116 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); 117 for (i=0,j=0; i<size; i++) { 118 if (w1[i]) {pa[j] = i; j++;} 119 } 120 121 /* Each message would have a header = 1 + 2*(no of IS) + data */ 122 for (i=0; i<nrqs; i++) { 123 j = pa[i]; 124 w1[j] += w2[j] + 2*w3[j]; 125 msz += w1[j]; 126 } 127 128 /* Determine the number of messages to expect, their lengths, from from-ids */ 129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 131 132 /* Now post the Irecvs corresponding to these messages */ 133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr); 134 135 /* Allocate Memory for outgoing messages */ 136 ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 137 ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr); 138 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 139 140 { 141 PetscInt *iptr = tmp,ict = 0; 142 for (i=0; i<nrqs; i++) { 143 j = pa[i]; 144 iptr += ict; 145 outdat[j] = iptr; 146 ict = w1[j]; 147 } 148 } 149 150 /* Form the outgoing messages */ 151 /*plug in the headers*/ 152 for (i=0; i<nrqs; i++) { 153 j = pa[i]; 154 outdat[j][0] = 0; 155 ierr = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 156 ptr[j] = outdat[j] + 2*w3[j] + 1; 157 } 158 159 /* Memory for doing local proc's work*/ 160 { 161 ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz, 162 M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr); 163 ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr); 164 ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr); 165 ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr); 166 ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr); 167 ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr); 168 169 for (i=0; i<imax; i++) { 170 table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i; 171 data[i] = d_p + M*i; 172 } 173 } 174 175 /* Parse the IS and update local tables and the outgoing buf with the data*/ 176 { 177 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j; 178 PetscBT table_i; 179 180 for (i=0; i<imax; i++) { 181 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 182 n_i = n[i]; 183 table_i = table[i]; 184 idx_i = idx[i]; 185 data_i = data[i]; 186 isz_i = isz[i]; 187 for (j=0; j<n_i; j++) { /* parse the indices of each IS */ 188 row = idx_i[j]; 189 ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr); 190 if (proc != rank) { /* copy to the outgoing buffer */ 191 ctr[proc]++; 192 *ptr[proc] = row; 193 ptr[proc]++; 194 } else { /* Update the local table */ 195 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 196 } 197 } 198 /* Update the headers for the current IS */ 199 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */ 200 if ((ctr_j = ctr[j])) { 201 outdat_j = outdat[j]; 202 k = ++outdat_j[0]; 203 outdat_j[2*k] = ctr_j; 204 outdat_j[2*k-1] = i; 205 } 206 } 207 isz[i] = isz_i; 208 } 209 } 210 211 /* Now post the sends */ 212 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 213 for (i=0; i<nrqs; ++i) { 214 j = pa[i]; 215 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr); 216 } 217 218 /* No longer need the original indices*/ 219 for (i=0; i<imax; ++i) { 220 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr); 221 } 222 ierr = PetscFree2(idx,n);CHKERRQ(ierr); 223 224 for (i=0; i<imax; ++i) { 225 ierr = ISDestroy(&is[i]);CHKERRQ(ierr); 226 } 227 228 /* Do Local work*/ 229 ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr); 230 231 /* Receive messages*/ 232 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr); 233 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);} 234 235 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr); 236 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);} 237 238 /* Phase 1 sends are complete - deallocate buffers */ 239 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr); 240 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 241 242 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr); 243 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr); 244 ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr); 245 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr); 246 ierr = PetscFree(rbuf);CHKERRQ(ierr); 247 248 249 /* Send the data back*/ 250 /* Do a global reduction to know the buffer space req for incoming messages*/ 251 { 252 PetscMPIInt *rw1; 253 254 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr); 255 ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 256 257 for (i=0; i<nrqr; ++i) { 258 proc = recv_status[i].MPI_SOURCE; 259 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch"); 260 rw1[proc] = isz1[i]; 261 } 262 ierr = PetscFree(onodes1);CHKERRQ(ierr); 263 ierr = PetscFree(olengths1);CHKERRQ(ierr); 264 265 /* Determine the number of messages to expect, their lengths, from from-ids */ 266 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr); 267 ierr = PetscFree(rw1);CHKERRQ(ierr); 268 } 269 /* Now post the Irecvs corresponding to these messages */ 270 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr); 271 272 /* Now post the sends */ 273 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 274 for (i=0; i<nrqr; ++i) { 275 j = recv_status[i].MPI_SOURCE; 276 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr); 277 } 278 279 /* receive work done on other processors*/ 280 { 281 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax; 282 PetscMPIInt idex; 283 PetscBT table_i; 284 MPI_Status *status2; 285 286 ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr); 287 for (i=0; i<nrqs; ++i) { 288 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr); 289 /* Process the message*/ 290 rbuf2_i = rbuf2[idex]; 291 ct1 = 2*rbuf2_i[0]+1; 292 jmax = rbuf2[idex][0]; 293 for (j=1; j<=jmax; j++) { 294 max = rbuf2_i[2*j]; 295 is_no = rbuf2_i[2*j-1]; 296 isz_i = isz[is_no]; 297 data_i = data[is_no]; 298 table_i = table[is_no]; 299 for (k=0; k<max; k++,ct1++) { 300 row = rbuf2_i[ct1]; 301 if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;} 302 } 303 isz[is_no] = isz_i; 304 } 305 } 306 307 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);} 308 ierr = PetscFree(status2);CHKERRQ(ierr); 309 } 310 311 for (i=0; i<imax; ++i) { 312 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr); 313 } 314 315 ierr = PetscFree(onodes2);CHKERRQ(ierr); 316 ierr = PetscFree(olengths2);CHKERRQ(ierr); 317 318 ierr = PetscFree(pa);CHKERRQ(ierr); 319 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr); 320 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 321 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 322 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 323 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 324 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 325 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr); 326 ierr = PetscFree(s_status);CHKERRQ(ierr); 327 ierr = PetscFree(recv_status);CHKERRQ(ierr); 328 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 329 ierr = PetscFree(xdata);CHKERRQ(ierr); 330 ierr = PetscFree(isz1);CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 #undef __FUNCT__ 335 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local" 336 /* 337 MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do 338 the work on the local processor. 339 340 Inputs: 341 C - MAT_MPIAIJ; 342 imax - total no of index sets processed at a time; 343 table - an array of char - size = m bits. 344 345 Output: 346 isz - array containing the count of the solution elements corresponding 347 to each index set; 348 data - pointer to the solutions 349 */ 350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 351 { 352 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 353 Mat A = c->A,B = c->B; 354 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 355 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 356 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 357 PetscBT table_i; 358 359 PetscFunctionBegin; 360 rstart = C->rmap->rstart; 361 cstart = C->cmap->rstart; 362 ai = a->i; 363 aj = a->j; 364 bi = b->i; 365 bj = b->j; 366 garray = c->garray; 367 368 369 for (i=0; i<imax; i++) { 370 data_i = data[i]; 371 table_i = table[i]; 372 isz_i = isz[i]; 373 for (j=0,max=isz[i]; j<max; j++) { 374 row = data_i[j] - rstart; 375 start = ai[row]; 376 end = ai[row+1]; 377 for (k=start; k<end; k++) { /* Amat */ 378 val = aj[k] + cstart; 379 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 380 } 381 start = bi[row]; 382 end = bi[row+1]; 383 for (k=start; k<end; k++) { /* Bmat */ 384 val = garray[bj[k]]; 385 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 386 } 387 } 388 isz[i] = isz_i; 389 } 390 PetscFunctionReturn(0); 391 } 392 393 #undef __FUNCT__ 394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 395 /* 396 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 397 and return the output 398 399 Input: 400 C - the matrix 401 nrqr - no of messages being processed. 402 rbuf - an array of pointers to the recieved requests 403 404 Output: 405 xdata - array of messages to be sent back 406 isz1 - size of each message 407 408 For better efficiency perhaps we should malloc separately each xdata[i], 409 then if a remalloc is required we need only copy the data for that one row 410 rather then all previous rows as it is now where a single large chunck of 411 memory is used. 412 413 */ 414 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 415 { 416 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 417 Mat A = c->A,B = c->B; 418 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 419 PetscErrorCode ierr; 420 PetscMPIInt rank; 421 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 422 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 423 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 424 PetscInt *rbuf_i,kmax,rbuf_0; 425 PetscBT xtable; 426 427 PetscFunctionBegin; 428 rank = c->rank; 429 m = C->rmap->N; 430 rstart = C->rmap->rstart; 431 cstart = C->cmap->rstart; 432 ai = a->i; 433 aj = a->j; 434 bi = b->i; 435 bj = b->j; 436 garray = c->garray; 437 438 439 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 440 rbuf_i = rbuf[i]; 441 rbuf_0 = rbuf_i[0]; 442 ct += rbuf_0; 443 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 444 } 445 446 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 447 else max1 = 1; 448 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1); 449 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 450 ++no_malloc; 451 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 452 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 453 454 ct3 = 0; 455 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 456 rbuf_i = rbuf[i]; 457 rbuf_0 = rbuf_i[0]; 458 ct1 = 2*rbuf_0+1; 459 ct2 = ct1; 460 ct3 += ct1; 461 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 462 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 463 oct2 = ct2; 464 kmax = rbuf_i[2*j]; 465 for (k=0; k<kmax; k++,ct1++) { 466 row = rbuf_i[ct1]; 467 if (!PetscBTLookupSet(xtable,row)) { 468 if (!(ct3 < mem_estimate)) { 469 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 470 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 471 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 472 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 473 xdata[0] = tmp; 474 mem_estimate = new_estimate; ++no_malloc; 475 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 476 } 477 xdata[i][ct2++] = row; 478 ct3++; 479 } 480 } 481 for (k=oct2,max2=ct2; k<max2; k++) { 482 row = xdata[i][k] - rstart; 483 start = ai[row]; 484 end = ai[row+1]; 485 for (l=start; l<end; l++) { 486 val = aj[l] + cstart; 487 if (!PetscBTLookupSet(xtable,val)) { 488 if (!(ct3 < mem_estimate)) { 489 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 490 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 491 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 492 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 493 xdata[0] = tmp; 494 mem_estimate = new_estimate; ++no_malloc; 495 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 496 } 497 xdata[i][ct2++] = val; 498 ct3++; 499 } 500 } 501 start = bi[row]; 502 end = bi[row+1]; 503 for (l=start; l<end; l++) { 504 val = garray[bj[l]]; 505 if (!PetscBTLookupSet(xtable,val)) { 506 if (!(ct3 < mem_estimate)) { 507 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 508 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 509 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 510 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 511 xdata[0] = tmp; 512 mem_estimate = new_estimate; ++no_malloc; 513 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 514 } 515 xdata[i][ct2++] = val; 516 ct3++; 517 } 518 } 519 } 520 /* Update the header*/ 521 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 522 xdata[i][2*j-1] = rbuf_i[2*j-1]; 523 } 524 xdata[i][0] = rbuf_0; 525 xdata[i+1] = xdata[i] + ct2; 526 isz1[i] = ct2; /* size of each message */ 527 } 528 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 529 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 530 PetscFunctionReturn(0); 531 } 532 /* -------------------------------------------------------------------------*/ 533 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool,Mat*); 534 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 535 /* 536 Every processor gets the entire matrix 537 */ 538 #undef __FUNCT__ 539 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 540 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 541 { 542 Mat B; 543 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 544 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 545 PetscErrorCode ierr; 546 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 547 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 548 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 549 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 550 551 PetscFunctionBegin; 552 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 553 ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 554 555 if (scall == MAT_INITIAL_MATRIX) { 556 /* ---------------------------------------------------------------- 557 Tell every processor the number of nonzeros per row 558 */ 559 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 560 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 561 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 562 } 563 sendcount = A->rmap->rend - A->rmap->rstart; 564 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 565 for (i=0; i<size; i++) { 566 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 567 displs[i] = A->rmap->range[i]; 568 } 569 #if defined(PETSC_HAVE_MPI_IN_PLACE) 570 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 571 #else 572 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 573 #endif 574 /* --------------------------------------------------------------- 575 Create the sequential matrix of the same type as the local block diagonal 576 */ 577 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 578 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 579 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 580 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 581 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 582 **Bin = B; 583 b = (Mat_SeqAIJ *)B->data; 584 585 /*-------------------------------------------------------------------- 586 Copy my part of matrix column indices over 587 */ 588 sendcount = ad->nz + bd->nz; 589 jsendbuf = b->j + b->i[rstarts[rank]]; 590 a_jsendbuf = ad->j; 591 b_jsendbuf = bd->j; 592 n = A->rmap->rend - A->rmap->rstart; 593 cnt = 0; 594 for (i=0; i<n; i++) { 595 596 /* put in lower diagonal portion */ 597 m = bd->i[i+1] - bd->i[i]; 598 while (m > 0) { 599 /* is it above diagonal (in bd (compressed) numbering) */ 600 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 601 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 602 m--; 603 } 604 605 /* put in diagonal portion */ 606 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 607 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 608 } 609 610 /* put in upper diagonal portion */ 611 while (m-- > 0) { 612 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 613 } 614 } 615 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 616 617 /*-------------------------------------------------------------------- 618 Gather all column indices to all processors 619 */ 620 for (i=0; i<size; i++) { 621 recvcounts[i] = 0; 622 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 623 recvcounts[i] += lens[j]; 624 } 625 } 626 displs[0] = 0; 627 for (i=1; i<size; i++) { 628 displs[i] = displs[i-1] + recvcounts[i-1]; 629 } 630 #if defined(PETSC_HAVE_MPI_IN_PLACE) 631 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 632 #else 633 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 634 #endif 635 /*-------------------------------------------------------------------- 636 Assemble the matrix into useable form (note numerical values not yet set) 637 */ 638 /* set the b->ilen (length of each row) values */ 639 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 640 /* set the b->i indices */ 641 b->i[0] = 0; 642 for (i=1; i<=A->rmap->N; i++) { 643 b->i[i] = b->i[i-1] + lens[i-1]; 644 } 645 ierr = PetscFree(lens);CHKERRQ(ierr); 646 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 647 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 648 649 } else { 650 B = **Bin; 651 b = (Mat_SeqAIJ *)B->data; 652 } 653 654 /*-------------------------------------------------------------------- 655 Copy my part of matrix numerical values into the values location 656 */ 657 if (flag == MAT_GET_VALUES){ 658 sendcount = ad->nz + bd->nz; 659 sendbuf = b->a + b->i[rstarts[rank]]; 660 a_sendbuf = ad->a; 661 b_sendbuf = bd->a; 662 b_sendj = bd->j; 663 n = A->rmap->rend - A->rmap->rstart; 664 cnt = 0; 665 for (i=0; i<n; i++) { 666 667 /* put in lower diagonal portion */ 668 m = bd->i[i+1] - bd->i[i]; 669 while (m > 0) { 670 /* is it above diagonal (in bd (compressed) numbering) */ 671 if (garray[*b_sendj] > A->rmap->rstart + i) break; 672 sendbuf[cnt++] = *b_sendbuf++; 673 m--; 674 b_sendj++; 675 } 676 677 /* put in diagonal portion */ 678 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 679 sendbuf[cnt++] = *a_sendbuf++; 680 } 681 682 /* put in upper diagonal portion */ 683 while (m-- > 0) { 684 sendbuf[cnt++] = *b_sendbuf++; 685 b_sendj++; 686 } 687 } 688 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 689 690 /* ----------------------------------------------------------------- 691 Gather all numerical values to all processors 692 */ 693 if (!recvcounts) { 694 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 695 } 696 for (i=0; i<size; i++) { 697 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 698 } 699 displs[0] = 0; 700 for (i=1; i<size; i++) { 701 displs[i] = displs[i-1] + recvcounts[i-1]; 702 } 703 recvbuf = b->a; 704 #if defined(PETSC_HAVE_MPI_IN_PLACE) 705 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 706 #else 707 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 708 #endif 709 } /* endof (flag == MAT_GET_VALUES) */ 710 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 711 712 if (A->symmetric){ 713 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 714 } else if (A->hermitian) { 715 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 716 } else if (A->structurally_symmetric) { 717 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 718 } 719 PetscFunctionReturn(0); 720 } 721 722 723 724 #undef __FUNCT__ 725 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 726 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 727 { 728 PetscErrorCode ierr; 729 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 730 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,wantallmatrixcolumns=PETSC_FALSE; 731 732 PetscFunctionBegin; 733 /* 734 Check for special case: each processor gets entire matrix 735 */ 736 if (ismax == 1 && C->rmap->N == C->cmap->N) { 737 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 738 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 739 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 740 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 741 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 742 wantallmatrix = PETSC_TRUE; 743 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 744 } 745 } 746 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 747 if (twantallmatrix) { 748 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 749 PetscFunctionReturn(0); 750 } 751 752 /* Allocate memory to hold all the submatrices */ 753 if (scall != MAT_REUSE_MATRIX) { 754 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 755 } 756 757 /* Check for special case: each processor gets entire matrix columns */ 758 if (ismax == 1){ 759 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 760 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 761 if (colflag && ncol == C->cmap->N){ 762 wantallmatrixcolumns = PETSC_TRUE; 763 ierr = MatGetSubMatrices_MPIAIJ_Local(C,ismax,isrow,iscol,scall,wantallmatrixcolumns,*submat);CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 } 767 768 /* Determine the number of stages through which submatrices are done */ 769 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 770 771 /* 772 Each stage will extract nmax submatrices. 773 nmax is determined by the matrix column dimension. 774 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 775 */ 776 if (!nmax) nmax = 1; 777 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 778 779 /* Make sure every processor loops through the nstages */ 780 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 781 782 for (i=0,pos=0; i<nstages; i++) { 783 if (pos+nmax <= ismax) max_no = nmax; 784 else if (pos == ismax) max_no = 0; 785 else max_no = ismax-pos; 786 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,wantallmatrixcolumns,*submat+pos);CHKERRQ(ierr); 787 pos += max_no; 788 } 789 PetscFunctionReturn(0); 790 } 791 792 /* -------------------------------------------------------------------------*/ 793 #undef __FUNCT__ 794 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 795 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats) 796 { 797 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 798 Mat A = c->A; 799 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 800 const PetscInt **icol,**irow; 801 PetscInt *nrow,*ncol,start; 802 PetscErrorCode ierr; 803 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 804 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 805 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 806 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 807 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 808 #if defined (PETSC_USE_CTABLE) 809 PetscTable *cmap,cmap_i,*rmap,rmap_i; 810 #else 811 PetscInt **cmap,*cmap_i,**rmap,*rmap_i; 812 #endif 813 const PetscInt *irow_i; 814 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 815 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 816 MPI_Request *r_waits4,*s_waits3,*s_waits4; 817 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 818 MPI_Status *r_status3,*r_status4,*s_status4; 819 MPI_Comm comm; 820 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 821 PetscMPIInt *onodes1,*olengths1; 822 PetscMPIInt idex,idex2,end; 823 824 PetscFunctionBegin; 825 comm = ((PetscObject)C)->comm; 826 tag0 = ((PetscObject)C)->tag; 827 size = c->size; 828 rank = c->rank; 829 830 /* Get some new tags to keep the communication clean */ 831 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 832 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 833 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 834 835 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 836 837 for (i=0; i<ismax; i++) { 838 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 839 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 840 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 841 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 842 } 843 844 /* evaluate communication - mesg to who, length of mesg, and buffer space 845 required. Based on this, buffers are allocated, and data copied into them*/ 846 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 847 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 848 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 849 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 850 for (i=0; i<ismax; i++) { 851 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 852 jmax = nrow[i]; 853 irow_i = irow[i]; 854 for (j=0; j<jmax; j++) { 855 l = 0; 856 row = irow_i[j]; 857 while (row >= C->rmap->range[l+1]) l++; 858 proc = l; 859 w4[proc]++; 860 } 861 for (j=0; j<size; j++) { 862 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 863 } 864 } 865 866 nrqs = 0; /* no of outgoing messages */ 867 msz = 0; /* total mesg length (for all procs) */ 868 w1[rank] = 0; /* no mesg sent to self */ 869 w3[rank] = 0; 870 for (i=0; i<size; i++) { 871 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 872 } 873 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 874 for (i=0,j=0; i<size; i++) { 875 if (w1[i]) { pa[j] = i; j++; } 876 } 877 878 /* Each message would have a header = 1 + 2*(no of IS) + data */ 879 for (i=0; i<nrqs; i++) { 880 j = pa[i]; 881 w1[j] += w2[j] + 2* w3[j]; 882 msz += w1[j]; 883 } 884 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 885 886 /* Determine the number of messages to expect, their lengths, from from-ids */ 887 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 888 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 889 890 /* Now post the Irecvs corresponding to these messages */ 891 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 892 893 ierr = PetscFree(onodes1);CHKERRQ(ierr); 894 ierr = PetscFree(olengths1);CHKERRQ(ierr); 895 896 /* Allocate Memory for outgoing messages */ 897 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 898 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 899 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 900 901 { 902 PetscInt *iptr = tmp,ict = 0; 903 for (i=0; i<nrqs; i++) { 904 j = pa[i]; 905 iptr += ict; 906 sbuf1[j] = iptr; 907 ict = w1[j]; 908 } 909 } 910 911 /* Form the outgoing messages */ 912 /* Initialize the header space */ 913 for (i=0; i<nrqs; i++) { 914 j = pa[i]; 915 sbuf1[j][0] = 0; 916 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 917 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 918 } 919 920 /* Parse the isrow and copy data into outbuf */ 921 for (i=0; i<ismax; i++) { 922 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 923 irow_i = irow[i]; 924 jmax = nrow[i]; 925 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 926 l = 0; 927 row = irow_i[j]; 928 while (row >= C->rmap->range[l+1]) l++; 929 proc = l; 930 if (proc != rank) { /* copy to the outgoing buf*/ 931 ctr[proc]++; 932 *ptr[proc] = row; 933 ptr[proc]++; 934 } 935 } 936 /* Update the headers for the current IS */ 937 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 938 if ((ctr_j = ctr[j])) { 939 sbuf1_j = sbuf1[j]; 940 k = ++sbuf1_j[0]; 941 sbuf1_j[2*k] = ctr_j; 942 sbuf1_j[2*k-1] = i; 943 } 944 } 945 } 946 947 /* Now post the sends */ 948 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 949 for (i=0; i<nrqs; ++i) { 950 j = pa[i]; 951 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 952 } 953 954 /* Post Receives to capture the buffer size */ 955 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 956 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 957 rbuf2[0] = tmp + msz; 958 for (i=1; i<nrqs; ++i) { 959 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 960 } 961 for (i=0; i<nrqs; ++i) { 962 j = pa[i]; 963 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 964 } 965 966 /* Send to other procs the buf size they should allocate */ 967 968 969 /* Receive messages*/ 970 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 971 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 972 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 973 { 974 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 975 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 976 PetscInt *sbuf2_i; 977 978 for (i=0; i<nrqr; ++i) { 979 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 980 req_size[idex] = 0; 981 rbuf1_i = rbuf1[idex]; 982 start = 2*rbuf1_i[0] + 1; 983 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 984 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 985 sbuf2_i = sbuf2[idex]; 986 for (j=start; j<end; j++) { 987 id = rbuf1_i[j] - rstart; 988 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 989 sbuf2_i[j] = ncols; 990 req_size[idex] += ncols; 991 } 992 req_source[idex] = r_status1[i].MPI_SOURCE; 993 /* form the header */ 994 sbuf2_i[0] = req_size[idex]; 995 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 996 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 997 } 998 } 999 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1000 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1001 1002 /* recv buffer sizes */ 1003 /* Receive messages*/ 1004 1005 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1006 ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1007 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1008 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1009 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1010 1011 for (i=0; i<nrqs; ++i) { 1012 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1013 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1014 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1015 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1016 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1017 } 1018 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1019 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1020 1021 /* Wait on sends1 and sends2 */ 1022 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1023 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1024 1025 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1026 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1027 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1028 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1029 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1030 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1031 1032 /* Now allocate buffers for a->j, and send them off */ 1033 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1034 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1035 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1036 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1037 1038 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1039 { 1040 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1041 PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1042 PetscInt cend = C->cmap->rend; 1043 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1044 1045 for (i=0; i<nrqr; i++) { 1046 rbuf1_i = rbuf1[i]; 1047 sbuf_aj_i = sbuf_aj[i]; 1048 ct1 = 2*rbuf1_i[0] + 1; 1049 ct2 = 0; 1050 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1051 kmax = rbuf1[i][2*j]; 1052 for (k=0; k<kmax; k++,ct1++) { 1053 row = rbuf1_i[ct1] - rstart; 1054 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1055 ncols = nzA + nzB; 1056 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1057 1058 /* load the column indices for this row into cols*/ 1059 cols = sbuf_aj_i + ct2; 1060 1061 lwrite = 0; 1062 for (l=0; l<nzB; l++) { 1063 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1064 } 1065 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1066 for (l=0; l<nzB; l++) { 1067 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1068 } 1069 1070 ct2 += ncols; 1071 } 1072 } 1073 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1074 } 1075 } 1076 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1077 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1078 1079 /* Allocate buffers for a->a, and send them off */ 1080 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1081 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1082 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1083 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1084 1085 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1086 { 1087 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1088 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1089 PetscInt cend = C->cmap->rend; 1090 PetscInt *b_j = b->j; 1091 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1092 1093 for (i=0; i<nrqr; i++) { 1094 rbuf1_i = rbuf1[i]; 1095 sbuf_aa_i = sbuf_aa[i]; 1096 ct1 = 2*rbuf1_i[0]+1; 1097 ct2 = 0; 1098 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1099 kmax = rbuf1_i[2*j]; 1100 for (k=0; k<kmax; k++,ct1++) { 1101 row = rbuf1_i[ct1] - rstart; 1102 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1103 ncols = nzA + nzB; 1104 cworkB = b_j + b_i[row]; 1105 vworkA = a_a + a_i[row]; 1106 vworkB = b_a + b_i[row]; 1107 1108 /* load the column values for this row into vals*/ 1109 vals = sbuf_aa_i+ct2; 1110 1111 lwrite = 0; 1112 for (l=0; l<nzB; l++) { 1113 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1114 } 1115 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1116 for (l=0; l<nzB; l++) { 1117 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1118 } 1119 1120 ct2 += ncols; 1121 } 1122 } 1123 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1124 } 1125 } 1126 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1127 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1128 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1129 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1130 1131 /* Form the matrix */ 1132 /* create col map: global col of C -> local col of submatrices */ 1133 if (!allcolumns){ 1134 const PetscInt *icol_i; 1135 #if defined (PETSC_USE_CTABLE) 1136 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1137 for (i=0; i<ismax; i++) { 1138 ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr); 1139 jmax = ncol[i]; 1140 icol_i = icol[i]; 1141 cmap_i = cmap[i]; 1142 for (j=0; j<jmax; j++) { 1143 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr); 1144 } 1145 } 1146 #else 1147 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1148 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1149 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1150 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1151 for (i=0; i<ismax; i++) { 1152 jmax = ncol[i]; 1153 icol_i = icol[i]; 1154 cmap_i = cmap[i]; 1155 for (j=0; j<jmax; j++) { 1156 cmap_i[icol_i[j]] = j+1; 1157 } 1158 } 1159 #endif 1160 } 1161 1162 /* Create lens which is required for MatCreate... */ 1163 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1164 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1165 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1166 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1167 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1168 1169 /* Update lens from local data */ 1170 for (i=0; i<ismax; i++) { 1171 jmax = nrow[i]; 1172 if (!allcolumns) cmap_i = cmap[i]; 1173 irow_i = irow[i]; 1174 lens_i = lens[i]; 1175 for (j=0; j<jmax; j++) { 1176 l = 0; 1177 row = irow_i[j]; 1178 while (row >= C->rmap->range[l+1]) l++; 1179 proc = l; 1180 if (proc == rank) { 1181 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1182 if (!allcolumns){ 1183 for (k=0; k<ncols; k++) { 1184 #if defined (PETSC_USE_CTABLE) 1185 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1186 #else 1187 tcol = cmap_i[cols[k]]; 1188 #endif 1189 if (tcol) { lens_i[j]++;} 1190 } 1191 } else { /* allcolumns */ 1192 lens_i[j] = ncols; 1193 } 1194 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1195 } 1196 } 1197 } 1198 1199 /* Create row map: global row of C -> local row of submatrices */ 1200 #if defined (PETSC_USE_CTABLE) 1201 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1202 for (i=0; i<ismax; i++) { 1203 ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr); 1204 rmap_i = rmap[i]; 1205 irow_i = irow[i]; 1206 jmax = nrow[i]; 1207 for (j=0; j<jmax; j++) { 1208 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr); 1209 } 1210 } 1211 #else 1212 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1213 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1214 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1215 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1216 for (i=0; i<ismax; i++) { 1217 rmap_i = rmap[i]; 1218 irow_i = irow[i]; 1219 jmax = nrow[i]; 1220 for (j=0; j<jmax; j++) { 1221 rmap_i[irow_i[j]] = j; 1222 } 1223 } 1224 #endif 1225 1226 /* Update lens from offproc data */ 1227 { 1228 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1229 1230 for (tmp2=0; tmp2<nrqs; tmp2++) { 1231 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1232 idex = pa[idex2]; 1233 sbuf1_i = sbuf1[idex]; 1234 jmax = sbuf1_i[0]; 1235 ct1 = 2*jmax+1; 1236 ct2 = 0; 1237 rbuf2_i = rbuf2[idex2]; 1238 rbuf3_i = rbuf3[idex2]; 1239 for (j=1; j<=jmax; j++) { 1240 is_no = sbuf1_i[2*j-1]; 1241 max1 = sbuf1_i[2*j]; 1242 lens_i = lens[is_no]; 1243 if (!allcolumns){cmap_i = cmap[is_no];} 1244 rmap_i = rmap[is_no]; 1245 for (k=0; k<max1; k++,ct1++) { 1246 #if defined (PETSC_USE_CTABLE) 1247 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1248 row--; 1249 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1250 #else 1251 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1252 #endif 1253 max2 = rbuf2_i[ct1]; 1254 for (l=0; l<max2; l++,ct2++) { 1255 if (!allcolumns){ 1256 #if defined (PETSC_USE_CTABLE) 1257 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1258 #else 1259 tcol = cmap_i[rbuf3_i[ct2]]; 1260 #endif 1261 if (tcol) { 1262 lens_i[row]++; 1263 } 1264 } else { /* allcolumns */ 1265 lens_i[row]++; /* lens_i[row] += max2 ? */ 1266 } 1267 } 1268 } 1269 } 1270 } 1271 } 1272 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1273 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1274 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1275 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1276 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1277 1278 /* Create the submatrices */ 1279 if (scall == MAT_REUSE_MATRIX) { 1280 PetscBool flag; 1281 1282 /* 1283 Assumes new rows are same length as the old rows,hence bug! 1284 */ 1285 for (i=0; i<ismax; i++) { 1286 mat = (Mat_SeqAIJ *)(submats[i]->data); 1287 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1288 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1289 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1290 /* Initial matrix as if empty */ 1291 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1292 submats[i]->factortype = C->factortype; 1293 } 1294 } else { 1295 for (i=0; i<ismax; i++) { 1296 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1297 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1298 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1299 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1300 } 1301 } 1302 1303 /* Assemble the matrices */ 1304 /* First assemble the local rows */ 1305 { 1306 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1307 PetscScalar *imat_a; 1308 1309 for (i=0; i<ismax; i++) { 1310 mat = (Mat_SeqAIJ*)submats[i]->data; 1311 imat_ilen = mat->ilen; 1312 imat_j = mat->j; 1313 imat_i = mat->i; 1314 imat_a = mat->a; 1315 if (!allcolumns) cmap_i = cmap[i]; 1316 rmap_i = rmap[i]; 1317 irow_i = irow[i]; 1318 jmax = nrow[i]; 1319 for (j=0; j<jmax; j++) { 1320 l = 0; 1321 row = irow_i[j]; 1322 while (row >= C->rmap->range[l+1]) l++; 1323 proc = l; 1324 if (proc == rank) { 1325 old_row = row; 1326 #if defined (PETSC_USE_CTABLE) 1327 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1328 row--; 1329 #else 1330 row = rmap_i[row]; 1331 #endif 1332 ilen_row = imat_ilen[row]; 1333 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1334 mat_i = imat_i[row] ; 1335 mat_a = imat_a + mat_i; 1336 mat_j = imat_j + mat_i; 1337 if (!allcolumns){ 1338 for (k=0; k<ncols; k++) { 1339 #if defined (PETSC_USE_CTABLE) 1340 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1341 #else 1342 tcol = cmap_i[cols[k]]; 1343 #endif 1344 if (tcol){ 1345 *mat_j++ = tcol - 1; 1346 *mat_a++ = vals[k]; 1347 ilen_row++; 1348 } 1349 } 1350 } else { /* allcolumns */ 1351 for (k=0; k<ncols; k++) { 1352 *mat_j++ = cols[k] ; /* global col index! */ 1353 *mat_a++ = vals[k]; 1354 ilen_row++; 1355 } 1356 } 1357 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1358 imat_ilen[row] = ilen_row; 1359 } 1360 } 1361 } 1362 } 1363 1364 /* Now assemble the off proc rows*/ 1365 { 1366 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1367 PetscInt *imat_j,*imat_i; 1368 PetscScalar *imat_a,*rbuf4_i; 1369 1370 for (tmp2=0; tmp2<nrqs; tmp2++) { 1371 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1372 idex = pa[idex2]; 1373 sbuf1_i = sbuf1[idex]; 1374 jmax = sbuf1_i[0]; 1375 ct1 = 2*jmax + 1; 1376 ct2 = 0; 1377 rbuf2_i = rbuf2[idex2]; 1378 rbuf3_i = rbuf3[idex2]; 1379 rbuf4_i = rbuf4[idex2]; 1380 for (j=1; j<=jmax; j++) { 1381 is_no = sbuf1_i[2*j-1]; 1382 rmap_i = rmap[is_no]; 1383 if (!allcolumns){cmap_i = cmap[is_no];} 1384 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1385 imat_ilen = mat->ilen; 1386 imat_j = mat->j; 1387 imat_i = mat->i; 1388 imat_a = mat->a; 1389 max1 = sbuf1_i[2*j]; 1390 for (k=0; k<max1; k++,ct1++) { 1391 row = sbuf1_i[ct1]; 1392 #if defined (PETSC_USE_CTABLE) 1393 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1394 row--; 1395 #else 1396 row = rmap_i[row]; 1397 #endif 1398 ilen = imat_ilen[row]; 1399 mat_i = imat_i[row] ; 1400 mat_a = imat_a + mat_i; 1401 mat_j = imat_j + mat_i; 1402 max2 = rbuf2_i[ct1]; 1403 if (!allcolumns){ 1404 for (l=0; l<max2; l++,ct2++) { 1405 1406 #if defined (PETSC_USE_CTABLE) 1407 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1408 #else 1409 tcol = cmap_i[rbuf3_i[ct2]]; 1410 #endif 1411 if (tcol) { 1412 *mat_j++ = tcol - 1; 1413 *mat_a++ = rbuf4_i[ct2]; 1414 ilen++; 1415 } 1416 } 1417 } else { /* allcolumns */ 1418 for (l=0; l<max2; l++,ct2++) { 1419 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1420 *mat_a++ = rbuf4_i[ct2]; 1421 ilen++; 1422 } 1423 } 1424 imat_ilen[row] = ilen; 1425 } 1426 } 1427 } 1428 } 1429 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1430 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1431 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1432 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1433 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1434 1435 /* Restore the indices */ 1436 for (i=0; i<ismax; i++) { 1437 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1438 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1439 } 1440 1441 /* Destroy allocated memory */ 1442 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1443 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1444 ierr = PetscFree(pa);CHKERRQ(ierr); 1445 1446 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1447 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1448 for (i=0; i<nrqr; ++i) { 1449 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1450 } 1451 for (i=0; i<nrqs; ++i) { 1452 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1453 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1454 } 1455 1456 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1457 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1458 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1459 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1460 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1461 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1462 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1463 #if defined (PETSC_USE_CTABLE) 1464 for (i=0; i<ismax; i++) { 1465 if (!allcolumns){ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);} 1466 ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr); 1467 } 1468 #else 1469 if (!allcolumns){ierr = PetscFree(cmap[0]);CHKERRQ(ierr);} 1470 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1471 #endif 1472 if (!allcolumns){ierr = PetscFree(cmap);CHKERRQ(ierr);} 1473 ierr = PetscFree(rmap);CHKERRQ(ierr); 1474 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1475 ierr = PetscFree(lens);CHKERRQ(ierr); 1476 1477 for (i=0; i<ismax; i++) { 1478 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1479 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1480 } 1481 PetscFunctionReturn(0); 1482 } 1483 1484 /* 1485 Observe that the Seq matrices used to construct this MPI matrix are not increfed. 1486 Be careful not to destroy them elsewhere. 1487 */ 1488 #undef __FUNCT__ 1489 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1490 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1491 { 1492 /* If making this function public, change the error returned in this function away from _PLIB. */ 1493 PetscErrorCode ierr; 1494 Mat_MPIAIJ *aij; 1495 PetscBool seqaij; 1496 1497 PetscFunctionBegin; 1498 /* Check to make sure the component matrices are compatible with C. */ 1499 ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1500 if(!seqaij) { 1501 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1502 } 1503 ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1504 if(!seqaij) { 1505 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1506 } 1507 if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) { 1508 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1509 } 1510 ierr = MatCreate(comm, C); CHKERRQ(ierr); 1511 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr); 1512 ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr); 1513 ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr); 1514 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1515 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1516 if((*C)->cmap->N != A->cmap->n + B->cmap->n) { 1517 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1518 } 1519 ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr); 1520 aij = (Mat_MPIAIJ*)((*C)->data); 1521 aij->A = A; 1522 aij->B = B; 1523 ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr); 1524 ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr); 1525 (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated); 1526 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1527 PetscFunctionReturn(0); 1528 } 1529 1530 #undef __FUNCT__ 1531 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1532 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1533 { 1534 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1535 PetscFunctionBegin; 1536 PetscValidPointer(A,2); 1537 PetscValidPointer(B,3); 1538 *A = aij->A; 1539 *B = aij->B; 1540 /* Note that we don't incref *A and *B, so be careful! */ 1541 PetscFunctionReturn(0); 1542 } 1543 1544 1545 #undef __FUNCT__ 1546 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1547 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1548 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1549 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1550 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1551 { 1552 PetscErrorCode ierr; 1553 PetscMPIInt size, flag; 1554 PetscInt i,ii; 1555 PetscInt ismax_c; 1556 PetscFunctionBegin; 1557 if(!ismax) { 1558 PetscFunctionReturn(0); 1559 } 1560 for(i = 0, ismax_c = 0; i < ismax; ++i) { 1561 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr); 1562 if(flag != MPI_IDENT) { 1563 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1564 } 1565 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1566 if(size > 1) { 1567 ++ismax_c; 1568 } 1569 } 1570 if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */ 1571 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr); 1572 } 1573 else { /* if(ismax_c) */ 1574 Mat *A,*B; 1575 IS *isrow_c, *iscol_c; 1576 PetscMPIInt size; 1577 /* 1578 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1579 array of sequential matrices underlying the resulting parallel matrices. 1580 Which arrays to allocate is based on the value of MatReuse scall. 1581 There are as many diag matrices as there are original index sets. 1582 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 1583 1584 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1585 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1586 will deposite the extracted diag and off-diag parts. 1587 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1588 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1589 */ 1590 1591 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1592 if (scall != MAT_REUSE_MATRIX) { 1593 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1594 } 1595 else { 1596 ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr); 1597 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr); 1598 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1599 for(i = 0, ii = 0; i < ismax; ++i) { 1600 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1601 if(size > 1) { 1602 ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr); 1603 ++ii; 1604 } 1605 else { 1606 A[i] = (*submat)[i]; 1607 } 1608 } 1609 } 1610 /* 1611 Construct the complements of the iscol ISs for parallel ISs only. 1612 These are used to extract the off-diag portion of the resulting parallel matrix. 1613 The row IS for the off-diag portion is the same as for the diag portion, 1614 so we merely alias the row IS, while skipping those that are sequential. 1615 */ 1616 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr); 1617 for(i = 0, ii = 0; i < ismax; ++i) { 1618 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1619 if(size > 1) { 1620 isrow_c[ii] = isrow[i]; 1621 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr); 1622 ++ii; 1623 } 1624 } 1625 /* Now obtain the sequential A and B submatrices separately. */ 1626 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr); 1627 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr); 1628 for(ii = 0; ii < ismax_c; ++ii) { 1629 ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr); 1630 } 1631 ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr); 1632 /* 1633 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1634 have been extracted directly into the parallel matrices containing them, or 1635 simply into the sequential matrix identical with the corresponding A (if size == 1). 1636 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1637 A is put into the correct submat slot (if size == 1). 1638 */ 1639 if(scall != MAT_REUSE_MATRIX) { 1640 for(i = 0, ii = 0; i < ismax; ++i) { 1641 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1642 if(size > 1) { 1643 /* 1644 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1645 */ 1646 /* Construct submat[i] from the Seq pieces A and B. */ 1647 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr); 1648 1649 ++ii; 1650 } 1651 else { 1652 (*submat)[i] = A[i]; 1653 } 1654 } 1655 } 1656 ierr = PetscFree(A); CHKERRQ(ierr); 1657 ierr = PetscFree(B); CHKERRQ(ierr); 1658 } 1659 PetscFunctionReturn(0); 1660 }/* MatGetSubMatricesParallel_MPIXAIJ() */ 1661 1662 1663 1664 #undef __FUNCT__ 1665 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1666 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1667 { 1668 PetscErrorCode ierr; 1669 PetscFunctionBegin; 1670 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ, 1671 MatCreateMPIAIJFromSeqMatrices_Private, 1672 MatMPIAIJExtractSeqMatrices_Private); 1673 CHKERRQ(ierr); 1674 PetscFunctionReturn(0); 1675 } 1676