/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the imax index sets
   in is[] by ov levels.  Each level is one application of
   MatIncreaseOverlap_MPIAIJ_Once(); the index sets in is[] are replaced
   by the enlarged ones.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - One round of overlap growth.

   Phase 1: each process determines, for each IS, which rows are owned by
   other processes, packs those rows into per-destination messages (format
   above) and sends them with tag1, while expanding locally owned rows in
   place (MatIncreaseOverlap_MPIAIJ_Local).
   Phase 2: incoming requests are serviced (MatIncreaseOverlap_MPIAIJ_Receive)
   and the replies are sent back with tag2; received replies are merged into
   the per-IS bit tables, and the enlarged sets replace the originals in is[].
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      /* w1[j]: total no of row indices for proc j; w3[j]: no of ISs touching proc j */
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    /* outdat[j] points into tmp; each destination gets a w1[j]-long slice */
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;  /* ptr[j]: cursor into the data part */
  }

  /* Memory for doing local proc's work: per-IS bit table (M bits) and a
     per-IS result array (up to M entries) */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the old sets are replaced by enlarged ones at the end of this routine */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;  /* skip the header, start of the data part */
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Replace the old index sets with the enlarged ones */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;  /* maps off-diagonal (B) column numbers to global columns */

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* for each locally owned row already in the set, append the global
       columns of that row (from both the diagonal and off-diagonal blocks)
       that are not yet in the set; data_i is preallocated with M entries so
       the appends cannot overflow */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
 rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz: total number of requested rows; ct: total header payload */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* estimate the reply size from the average nonzeros per local row; grown
     on demand below if the estimate turns out too small */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct3 tracks total entries written into the single xdata[0] arena; all
     xdata[i] alias into that arena, so every realloc must rebase them */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct1    = 2*rbuf_0+1;
    ct2    = ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* arena full: grow by 1.5x and rebase the per-message pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand every requested row by its column entries (one overlap level) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;   /* start of the next message's slice */
    isz1[i]     = ct2;              /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
     Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Builds (or refills, for MAT_REUSE_MATRIX) a
   sequential copy of the whole parallel matrix on every process, gathering
   the row lengths, column indices and (when flag == MAT_GET_VALUES) the
   numerical values with MPI_Allgatherv.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr      = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, writing directly into
       this process's slice of B's global j array.  Within a row, columns
       are emitted in ascending global order: off-diagonal entries below
       the diagonal block, then the diagonal block, then the rest.
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as for the column indices so
       the values land next to their indices */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path: counts were not built above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }
/* endof (flag == MAT_GET_VALUES) */ 709 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 710 711 if (A->symmetric){ 712 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 713 } else if (A->hermitian) { 714 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 715 } else if (A->structurally_symmetric) { 716 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 717 } 718 PetscFunctionReturn(0); 719 } 720 721 722 723 #undef __FUNCT__ 724 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 725 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 726 { 727 PetscErrorCode ierr; 728 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 729 PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns; 730 731 PetscFunctionBegin; 732 733 /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */ 734 /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level. 735 However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that 736 take care to order the result correctly by assembling it with MatSetValues() (after preallocating). 
737 */ 738 for (i = 0; i < ismax; ++i) { 739 PetscBool sorted; 740 ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr); 741 if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i); 742 } 743 744 /* 745 Check for special case: each processor gets entire matrix 746 */ 747 if (ismax == 1 && C->rmap->N == C->cmap->N) { 748 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 749 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 750 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 751 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 752 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 753 wantallmatrix = PETSC_TRUE; 754 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 755 } 756 } 757 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 758 if (twantallmatrix) { 759 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 760 PetscFunctionReturn(0); 761 } 762 763 /* Allocate memory to hold all the submatrices */ 764 if (scall != MAT_REUSE_MATRIX) { 765 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 766 } 767 768 /* Check for special case: each processor gets entire matrix columns */ 769 ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr); 770 for (i=0; i<ismax; i++) { 771 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr); 772 ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr); 773 if (colflag && ncol == C->cmap->N){ 774 allcolumns[i] = PETSC_TRUE; 775 } else { 776 allcolumns[i] = PETSC_FALSE; 777 } 778 } 779 780 /* Determine the number of stages through which submatrices are done */ 781 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 782 783 /* 784 Each stage will extract nmax submatrices. 785 nmax is determined by the matrix column dimension. 
786 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 787 */ 788 if (!nmax) nmax = 1; 789 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 790 791 /* Make sure every processor loops through the nstages */ 792 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 793 794 for (i=0,pos=0; i<nstages; i++) { 795 if (pos+nmax <= ismax) max_no = nmax; 796 else if (pos == ismax) max_no = 0; 797 else max_no = ismax-pos; 798 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr); 799 pos += max_no; 800 } 801 802 ierr = PetscFree(allcolumns);CHKERRQ(ierr); 803 PetscFunctionReturn(0); 804 } 805 806 /* -------------------------------------------------------------------------*/ 807 #undef __FUNCT__ 808 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 809 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats) 810 { 811 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 812 Mat A = c->A; 813 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 814 const PetscInt **icol,**irow; 815 PetscInt *nrow,*ncol,start; 816 PetscErrorCode ierr; 817 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 818 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 819 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 820 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 821 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 822 #if defined (PETSC_USE_CTABLE) 823 PetscTable *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i; 824 #else 825 PetscInt **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i; 826 #endif 827 const PetscInt *irow_i; 828 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 829 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 830 MPI_Request *r_waits4,*s_waits3,*s_waits4; 831 MPI_Status 
*r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 832 MPI_Status *r_status3,*r_status4,*s_status4; 833 MPI_Comm comm; 834 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 835 PetscMPIInt *onodes1,*olengths1; 836 PetscMPIInt idex,idex2,end; 837 838 PetscFunctionBegin; 839 840 comm = ((PetscObject)C)->comm; 841 tag0 = ((PetscObject)C)->tag; 842 size = c->size; 843 rank = c->rank; 844 845 /* Get some new tags to keep the communication clean */ 846 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 847 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 848 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 849 850 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 851 852 for (i=0; i<ismax; i++) { 853 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 854 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 855 if (allcolumns[i]){ 856 icol[i] = PETSC_NULL; 857 ncol[i] = C->cmap->N; 858 } else { 859 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 860 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 861 } 862 } 863 864 /* evaluate communication - mesg to who, length of mesg, and buffer space 865 required. 
Based on this, buffers are allocated, and data copied into them*/ 866 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 867 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 868 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 869 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 870 for (i=0; i<ismax; i++) { 871 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 872 jmax = nrow[i]; 873 irow_i = irow[i]; 874 for (j=0; j<jmax; j++) { 875 l = 0; 876 row = irow_i[j]; 877 while (row >= C->rmap->range[l+1]) l++; 878 proc = l; 879 w4[proc]++; 880 } 881 for (j=0; j<size; j++) { 882 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 883 } 884 } 885 886 nrqs = 0; /* no of outgoing messages */ 887 msz = 0; /* total mesg length (for all procs) */ 888 w1[rank] = 0; /* no mesg sent to self */ 889 w3[rank] = 0; 890 for (i=0; i<size; i++) { 891 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 892 } 893 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 894 for (i=0,j=0; i<size; i++) { 895 if (w1[i]) { pa[j] = i; j++; } 896 } 897 898 /* Each message would have a header = 1 + 2*(no of IS) + data */ 899 for (i=0; i<nrqs; i++) { 900 j = pa[i]; 901 w1[j] += w2[j] + 2* w3[j]; 902 msz += w1[j]; 903 } 904 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 905 906 /* Determine the number of messages to expect, their lengths, from from-ids */ 907 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 908 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 909 910 /* Now post the Irecvs corresponding to these messages */ 911 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 912 913 ierr = PetscFree(onodes1);CHKERRQ(ierr); 914 ierr = PetscFree(olengths1);CHKERRQ(ierr); 915 916 /* Allocate Memory for outgoing messages */ 917 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 918 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 919 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 920 921 { 922 PetscInt *iptr = tmp,ict = 0; 923 for (i=0; i<nrqs; i++) { 924 j = pa[i]; 925 iptr += ict; 926 sbuf1[j] = iptr; 927 ict = w1[j]; 928 } 929 } 930 931 /* Form the outgoing messages */ 932 /* Initialize the header space */ 933 for (i=0; i<nrqs; i++) { 934 j = pa[i]; 935 sbuf1[j][0] = 0; 936 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 937 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 938 } 939 940 /* Parse the isrow and copy data into outbuf */ 941 for (i=0; i<ismax; i++) { 942 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 943 irow_i = irow[i]; 944 jmax = nrow[i]; 945 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 946 l = 0; 947 row = irow_i[j]; 948 while (row >= C->rmap->range[l+1]) l++; 949 proc = l; 950 if (proc != rank) { /* copy to the outgoing buf*/ 951 ctr[proc]++; 952 *ptr[proc] = row; 953 ptr[proc]++; 954 } 955 } 956 /* Update the headers for the current IS */ 957 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 958 if ((ctr_j = ctr[j])) { 959 sbuf1_j = sbuf1[j]; 960 k = ++sbuf1_j[0]; 961 sbuf1_j[2*k] = ctr_j; 962 sbuf1_j[2*k-1] = i; 963 } 964 } 965 } 966 967 /* Now post the sends */ 968 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 969 for (i=0; i<nrqs; ++i) { 970 j = pa[i]; 971 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 972 } 973 974 /* Post Receives to capture the buffer size */ 975 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 976 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 977 rbuf2[0] = tmp + msz; 978 for (i=1; i<nrqs; ++i) { 979 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 980 } 981 for (i=0; i<nrqs; ++i) { 982 j = pa[i]; 983 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 984 } 985 986 /* Send to other procs the buf size they should allocate */ 987 988 989 /* Receive messages*/ 990 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 991 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 992 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 993 { 994 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 995 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 996 PetscInt *sbuf2_i; 997 998 for (i=0; i<nrqr; ++i) { 999 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1000 req_size[idex] = 0; 1001 rbuf1_i = rbuf1[idex]; 1002 start = 2*rbuf1_i[0] + 1; 1003 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1004 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 1005 sbuf2_i = sbuf2[idex]; 1006 for (j=start; j<end; j++) { 1007 id = rbuf1_i[j] - rstart; 1008 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1009 sbuf2_i[j] = ncols; 1010 req_size[idex] += ncols; 1011 } 1012 req_source[idex] = r_status1[i].MPI_SOURCE; 1013 /* form the header */ 1014 sbuf2_i[0] = req_size[idex]; 1015 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1016 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1017 } 1018 } 1019 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1020 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1021 1022 /* recv buffer sizes */ 1023 /* Receive messages*/ 1024 1025 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1026 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1027 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1028 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1029 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1030 1031 for (i=0; i<nrqs; ++i) { 1032 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1033 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1034 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1035 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1036 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1037 } 1038 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1039 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1040 1041 /* Wait on sends1 and sends2 */ 1042 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1043 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1044 1045 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1046 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1047 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1048 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1049 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1050 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1051 1052 /* Now allocate buffers for a->j, and send them off */ 1053 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1054 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1055 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1056 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1057 1058 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1059 { 1060 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1061 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1062 PetscInt cend = C->cmap->rend; 1063 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1064 1065 for (i=0; i<nrqr; i++) { 1066 rbuf1_i = rbuf1[i]; 1067 sbuf_aj_i = sbuf_aj[i]; 1068 ct1 = 2*rbuf1_i[0] + 1; 1069 ct2 = 0; 1070 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1071 kmax = rbuf1[i][2*j]; 1072 for (k=0; k<kmax; k++,ct1++) { 1073 row = rbuf1_i[ct1] - rstart; 1074 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1075 ncols = nzA + nzB; 1076 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1077 1078 /* load the column indices for this row into cols*/ 1079 cols = sbuf_aj_i + ct2; 1080 1081 lwrite = 0; 1082 for (l=0; l<nzB; l++) { 1083 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1084 } 1085 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1086 for (l=0; l<nzB; l++) { 1087 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1088 } 1089 1090 ct2 += ncols; 1091 } 1092 } 1093 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1094 } 1095 } 1096 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1097 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1098 1099 /* Allocate buffers for a->a, and send them off */ 1100 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1101 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1102 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1103 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1104 1105 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1106 { 1107 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1108 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1109 PetscInt cend = C->cmap->rend; 1110 PetscInt *b_j = b->j; 1111 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1112 1113 for (i=0; 
i<nrqr; i++) { 1114 rbuf1_i = rbuf1[i]; 1115 sbuf_aa_i = sbuf_aa[i]; 1116 ct1 = 2*rbuf1_i[0]+1; 1117 ct2 = 0; 1118 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1119 kmax = rbuf1_i[2*j]; 1120 for (k=0; k<kmax; k++,ct1++) { 1121 row = rbuf1_i[ct1] - rstart; 1122 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1123 ncols = nzA + nzB; 1124 cworkB = b_j + b_i[row]; 1125 vworkA = a_a + a_i[row]; 1126 vworkB = b_a + b_i[row]; 1127 1128 /* load the column values for this row into vals*/ 1129 vals = sbuf_aa_i+ct2; 1130 1131 lwrite = 0; 1132 for (l=0; l<nzB; l++) { 1133 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1134 } 1135 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1136 for (l=0; l<nzB; l++) { 1137 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1138 } 1139 1140 ct2 += ncols; 1141 } 1142 } 1143 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1144 } 1145 } 1146 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1147 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1148 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1149 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1150 1151 /* Form the matrix */ 1152 /* create col map: global col of C -> local col of submatrices */ 1153 { 1154 const PetscInt *icol_i; 1155 #if defined (PETSC_USE_CTABLE) 1156 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1157 for (i=0; i<ismax; i++) { 1158 if (!allcolumns[i]){ 1159 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1160 jmax = ncol[i]; 1161 icol_i = icol[i]; 1162 cmap_i = cmap[i]; 1163 for (j=0; j<jmax; j++) { 1164 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1165 } 1166 } else { 1167 cmap[i] = PETSC_NULL; 1168 } 1169 } 1170 #else 1171 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1172 for (i=0; i<ismax; i++) { 1173 if (!allcolumns[i]){ 1174 ierr = 
PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1175 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1176 jmax = ncol[i]; 1177 icol_i = icol[i]; 1178 cmap_i = cmap[i]; 1179 for (j=0; j<jmax; j++) { 1180 cmap_i[icol_i[j]] = j+1; 1181 } 1182 } else { 1183 cmap[i] = PETSC_NULL; 1184 } 1185 } 1186 #endif 1187 } 1188 1189 /* Create lens which is required for MatCreate... */ 1190 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1191 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1192 if (ismax) { 1193 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1194 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1195 } 1196 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1197 1198 /* Update lens from local data */ 1199 for (i=0; i<ismax; i++) { 1200 jmax = nrow[i]; 1201 if (!allcolumns[i]) cmap_i = cmap[i]; 1202 irow_i = irow[i]; 1203 lens_i = lens[i]; 1204 for (j=0; j<jmax; j++) { 1205 l = 0; 1206 row = irow_i[j]; 1207 while (row >= C->rmap->range[l+1]) l++; 1208 proc = l; 1209 if (proc == rank) { 1210 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1211 if (!allcolumns[i]){ 1212 for (k=0; k<ncols; k++) { 1213 #if defined (PETSC_USE_CTABLE) 1214 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1215 #else 1216 tcol = cmap_i[cols[k]]; 1217 #endif 1218 if (tcol) { lens_i[j]++;} 1219 } 1220 } else { /* allcolumns */ 1221 lens_i[j] = ncols; 1222 } 1223 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1224 } 1225 } 1226 } 1227 1228 /* Create row map: global row of C -> local row of submatrices */ 1229 #if defined (PETSC_USE_CTABLE) 1230 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1231 for (i=0; i<ismax; i++) { 1232 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1233 rmap_i = rmap[i]; 1234 irow_i = irow[i]; 1235 jmax = nrow[i]; 1236 for (j=0; j<jmax; j++) { 1237 ierr = 
PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1238 } 1239 } 1240 #else 1241 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1242 if (ismax) { 1243 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1244 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1245 } 1246 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1247 for (i=0; i<ismax; i++) { 1248 rmap_i = rmap[i]; 1249 irow_i = irow[i]; 1250 jmax = nrow[i]; 1251 for (j=0; j<jmax; j++) { 1252 rmap_i[irow_i[j]] = j; 1253 } 1254 } 1255 #endif 1256 1257 /* Update lens from offproc data */ 1258 { 1259 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1260 1261 for (tmp2=0; tmp2<nrqs; tmp2++) { 1262 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1263 idex = pa[idex2]; 1264 sbuf1_i = sbuf1[idex]; 1265 jmax = sbuf1_i[0]; 1266 ct1 = 2*jmax+1; 1267 ct2 = 0; 1268 rbuf2_i = rbuf2[idex2]; 1269 rbuf3_i = rbuf3[idex2]; 1270 for (j=1; j<=jmax; j++) { 1271 is_no = sbuf1_i[2*j-1]; 1272 max1 = sbuf1_i[2*j]; 1273 lens_i = lens[is_no]; 1274 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1275 rmap_i = rmap[is_no]; 1276 for (k=0; k<max1; k++,ct1++) { 1277 #if defined (PETSC_USE_CTABLE) 1278 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1279 row--; 1280 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1281 #else 1282 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1283 #endif 1284 max2 = rbuf2_i[ct1]; 1285 for (l=0; l<max2; l++,ct2++) { 1286 if (!allcolumns[is_no]){ 1287 #if defined (PETSC_USE_CTABLE) 1288 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1289 #else 1290 tcol = cmap_i[rbuf3_i[ct2]]; 1291 #endif 1292 if (tcol) { 1293 lens_i[row]++; 1294 } 1295 } else { /* allcolumns */ 1296 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1297 } 1298 } 1299 } 1300 } 1301 } 1302 } 1303 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1304 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1305 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1306 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1307 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1308 1309 /* Create the submatrices */ 1310 if (scall == MAT_REUSE_MATRIX) { 1311 PetscBool flag; 1312 1313 /* 1314 Assumes new rows are same length as the old rows,hence bug! 1315 */ 1316 for (i=0; i<ismax; i++) { 1317 mat = (Mat_SeqAIJ *)(submats[i]->data); 1318 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1319 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1320 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1321 /* Initial matrix as if empty */ 1322 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1323 submats[i]->factortype = C->factortype; 1324 } 1325 } else { 1326 for (i=0; i<ismax; i++) { 1327 PetscInt rbs,cbs; 1328 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1329 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1330 1331 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1332 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1333 1334 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1335 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1336 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1337 } 1338 } 1339 1340 /* Assemble the matrices */ 1341 /* First assemble the local rows */ 1342 { 1343 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1344 PetscScalar *imat_a; 1345 1346 for (i=0; i<ismax; i++) { 1347 mat = (Mat_SeqAIJ*)submats[i]->data; 1348 imat_ilen = mat->ilen; 1349 imat_j = mat->j; 
1350 imat_i = mat->i; 1351 imat_a = mat->a; 1352 if (!allcolumns[i]) cmap_i = cmap[i]; 1353 rmap_i = rmap[i]; 1354 irow_i = irow[i]; 1355 jmax = nrow[i]; 1356 for (j=0; j<jmax; j++) { 1357 l = 0; 1358 row = irow_i[j]; 1359 while (row >= C->rmap->range[l+1]) l++; 1360 proc = l; 1361 if (proc == rank) { 1362 old_row = row; 1363 #if defined (PETSC_USE_CTABLE) 1364 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1365 row--; 1366 #else 1367 row = rmap_i[row]; 1368 #endif 1369 ilen_row = imat_ilen[row]; 1370 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1371 mat_i = imat_i[row] ; 1372 mat_a = imat_a + mat_i; 1373 mat_j = imat_j + mat_i; 1374 if (!allcolumns[i]){ 1375 for (k=0; k<ncols; k++) { 1376 #if defined (PETSC_USE_CTABLE) 1377 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1378 #else 1379 tcol = cmap_i[cols[k]]; 1380 #endif 1381 if (tcol){ 1382 *mat_j++ = tcol - 1; 1383 *mat_a++ = vals[k]; 1384 ilen_row++; 1385 } 1386 } 1387 } else { /* allcolumns */ 1388 for (k=0; k<ncols; k++) { 1389 *mat_j++ = cols[k] ; /* global col index! 
*/ 1390 *mat_a++ = vals[k]; 1391 ilen_row++; 1392 } 1393 } 1394 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1395 imat_ilen[row] = ilen_row; 1396 } 1397 } 1398 } 1399 } 1400 1401 /* Now assemble the off proc rows*/ 1402 { 1403 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1404 PetscInt *imat_j,*imat_i; 1405 PetscScalar *imat_a,*rbuf4_i; 1406 1407 for (tmp2=0; tmp2<nrqs; tmp2++) { 1408 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1409 idex = pa[idex2]; 1410 sbuf1_i = sbuf1[idex]; 1411 jmax = sbuf1_i[0]; 1412 ct1 = 2*jmax + 1; 1413 ct2 = 0; 1414 rbuf2_i = rbuf2[idex2]; 1415 rbuf3_i = rbuf3[idex2]; 1416 rbuf4_i = rbuf4[idex2]; 1417 for (j=1; j<=jmax; j++) { 1418 is_no = sbuf1_i[2*j-1]; 1419 rmap_i = rmap[is_no]; 1420 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1421 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1422 imat_ilen = mat->ilen; 1423 imat_j = mat->j; 1424 imat_i = mat->i; 1425 imat_a = mat->a; 1426 max1 = sbuf1_i[2*j]; 1427 for (k=0; k<max1; k++,ct1++) { 1428 row = sbuf1_i[ct1]; 1429 #if defined (PETSC_USE_CTABLE) 1430 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1431 row--; 1432 #else 1433 row = rmap_i[row]; 1434 #endif 1435 ilen = imat_ilen[row]; 1436 mat_i = imat_i[row] ; 1437 mat_a = imat_a + mat_i; 1438 mat_j = imat_j + mat_i; 1439 max2 = rbuf2_i[ct1]; 1440 if (!allcolumns[is_no]){ 1441 for (l=0; l<max2; l++,ct2++) { 1442 1443 #if defined (PETSC_USE_CTABLE) 1444 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1445 #else 1446 tcol = cmap_i[rbuf3_i[ct2]]; 1447 #endif 1448 if (tcol) { 1449 *mat_j++ = tcol - 1; 1450 *mat_a++ = rbuf4_i[ct2]; 1451 ilen++; 1452 } 1453 } 1454 } else { /* allcolumns */ 1455 for (l=0; l<max2; l++,ct2++) { 1456 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1457 *mat_a++ = rbuf4_i[ct2]; 1458 ilen++; 1459 } 1460 } 1461 imat_ilen[row] = ilen; 1462 } 1463 } 1464 } 1465 } 1466 1467 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1468 ierr = 
PetscFree(r_waits4);CHKERRQ(ierr); 1469 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1470 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1471 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1472 1473 /* Restore the indices */ 1474 for (i=0; i<ismax; i++) { 1475 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1476 if (!allcolumns[i]){ 1477 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1478 } 1479 } 1480 1481 /* Destroy allocated memory */ 1482 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1483 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1484 ierr = PetscFree(pa);CHKERRQ(ierr); 1485 1486 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1487 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1488 for (i=0; i<nrqr; ++i) { 1489 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1490 } 1491 for (i=0; i<nrqs; ++i) { 1492 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1493 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1494 } 1495 1496 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1497 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1498 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1499 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1500 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1501 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1502 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1503 1504 #if defined (PETSC_USE_CTABLE) 1505 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1506 #else 1507 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1508 #endif 1509 ierr = PetscFree(rmap);CHKERRQ(ierr); 1510 1511 for (i=0; i<ismax; i++) { 1512 if (!allcolumns[i]){ 1513 #if defined (PETSC_USE_CTABLE) 1514 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1515 #else 1516 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1517 #endif 1518 } 1519 } 1520 ierr = PetscFree(cmap);CHKERRQ(ierr); 1521 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1522 ierr = PetscFree(lens);CHKERRQ(ierr); 1523 1524 for (i=0; i<ismax; i++) { 1525 ierr = 
MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   Builds an MPIAIJ matrix *C on communicator comm directly from its two sequential
   pieces: A (the "diagonal" block) and B (the "off-diagonal" block).

   Input:
     comm - communicator for the resulting parallel matrix
     A    - MATSEQAIJ diagonal block; supplies the local row/column sizes of *C
     B    - MATSEQAIJ off-diagonal block; must match A's local row count and block sizes
   Output:
     C    - the assembled MPIAIJ matrix

   Ownership: A and B are stored inside *C WITHOUT incrementing their reference
   counts (see the note above); the caller must not destroy them separately.
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  /* Local sizes come from A; global sizes are derived by PETSc (PETSC_DECIDE). */
  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* Together the diag and off-diag blocks must account for every global column of *C. */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  /* Plug the sequential pieces straight into C's internal data structure (no incref!). */
  aij = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B);CHKERRQ(ierr);
  /* *C is preallocated/assembled only to the extent that both of its pieces are. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
   Hands back (borrowed references to) the diagonal (*A) and off-diagonal (*B)
   sequential blocks stored inside the MPIAIJ matrix C.  The references are NOT
   increfed; the caller must not destroy them.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   Driver that extracts submatrices of C for the index-set pairs (isrow[i], iscol[i]),
   allowing the index sets (and hence the resulting submatrices) to live on parallel
   communicators.  The matrix-format-specific work is delegated through three callbacks:

     getsubmats_seq - extracts sequential submatrices for a list of IS pairs
                      (also allocates the output Mat array when scall != MAT_REUSE_MATRIX)
     makefromseq    - assembles a parallel matrix from a diag/off-diag pair of seq matrices
     extractseq     - returns the diag/off-diag seq pieces of an existing parallel matrix

   Sequential (comm size == 1) IS pairs produce sequential submatrices directly; parallel
   IS pairs produce parallel submatrices built from separately extracted diag and
   off-diag (nonlocal-column) pieces.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt size, flag;
  PetscInt i,ii;
  PetscInt ismax_c;  /* number of index-set pairs living on a parallel (size > 1) communicator */

  PetscFunctionBegin;
  if (!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate that each row/column IS pair shares one communicator, and count the parallel pairs. */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) {
      ++ismax_c;
    }
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat *A,*B;
    IS *isrow_c, *iscol_c;
    PetscMPIInt size;  /* NOTE(review): shadows the outer 'size'; both hold communicator sizes */
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
      array of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
      getsubmats_seq will deposit the extracted diag and off-diag parts.
      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /* ii advances only over the parallel pairs: B has ismax_c slots, A has ismax. */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else {
          A[i] = (*submat)[i];
        }
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i];
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    /* The nonlocal-column ISs were created here, so destroy them here; the row ISs were aliased. */
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);

          ++ii;
        } else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Free only the holder arrays; the Mat handles themselves now live in *submat
       (directly, or inside the parallel matrices built by makefromseq). */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   MPIAIJ front end for MatGetSubMatricesParallel_MPIXAIJ: binds the generic driver
   to the AIJ-specific extraction, construction, and decomposition callbacks.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}