/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

/* Note: the prototype declares the table argument as char**; the definition below
   uses PetscBT* — presumably PetscBT is a char* typedef so these match; confirm
   against petscbt.h. */
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Increases the overlap of the imax index sets
   in is[] by ov levels, one level per call to MatIncreaseOverlap_MPIAIJ_Once.
   Each pass replaces every is[i] with a new (self-communicator) IS that in
   addition contains the column indices of all rows currently in the set.
   ov < 0 is rejected; ov == 0 is a no-op.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - One level of overlap increase for imax
   index sets.  Rows owned by other ranks are shipped to their owners (message
   format above); locally owned rows are expanded in place by
   MatIncreaseOverlap_MPIAIJ_Local; remote replies are merged back, and each
   is[i] is destroyed and recreated with the enlarged index list.

   Each is[i] gets a private bit table (table[i]) to deduplicate entries and a
   private slab of capacity M = C->rmap->N inside d_p to accumulate them, so no
   reallocation is ever needed on the local accumulation path.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* two fresh tags: tag1 for the request phase, tag2 for the reply phase */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = number of indices destined for rank p, w3[p] = number of index
     sets that contribute to the message for p, w2[p] = 0/1 "message exists",
     w4 = per-IS scratch. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr);   /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++; }
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i]) { w2[i] = 1; nrqs++; } /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages: all messages live contiguously in
     tmp; outdat[p] points at rank p's slice of tmp */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;            /* advance past the previous rank's slice */
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: mesg[0] = #IS, then (is_no,count) pairs; ptr[j] is
     the running write cursor into the data area after the header */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table and one length-M index
     slab per IS, carved out of the single allocations t_p and d_p */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {   /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table (bit table filters duplicates) */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;  /* count for this IS */
          outdat_j[2*k-1] = i;      /* which IS */
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends (phase 1: requests, tag1) */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the input index sets are consumed; new ones are created at the end */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps with the communication above) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) { ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr); }

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) { ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr); }

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* process the received requests; xdata[i]/isz1[i] are the replies */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      /* sanity check: Waitall statuses must line up with the posted recvs */
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages (phase 2, tag2) */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends (phase 2: replies, tag2) */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge into the local tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the request messages */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row; }
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) { ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr); }
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* build the enlarged output index sets (sequential, PETSC_COMM_SELF) */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions

   For each index set i, every global row index currently stored in data[i]
   (the first isz[i] entries) is expanded by the global column indices of that
   locally owned row: the diagonal block A contributes aj[]+cstart, the
   off-diagonal block B contributes garray[bj[]].  The shared bit table
   table[i] filters duplicates; new entries are appended to data[i] and isz[i]
   is updated.  data[i] is assumed large enough (the caller allocates capacity
   M per set) so no bounds check is needed here.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A  = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;  /* first locally owned row (global numbering) */
  cstart = C->cmap->rstart;  /* first locally owned column (global numbering) */
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;        /* maps compressed B-block columns to global */

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    /* note: only the first max = isz[i] (pre-existing) rows are expanded;
       entries appended during this loop are not themselves expanded here */
    for (j=0,max=isz[i]; j<max; j++) {
      row   = data_i[j] - rstart;   /* local row index */
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local columns, shift to global */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: translate via garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val; }
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
      and return the output

         Input:
           C    - the matrix
           nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

   All reply messages share one heap arena: xdata[0] points at its base and
   xdata[i+1] = xdata[i] + isz1[i] alias into it.  When the ct3 estimate is
   exceeded, the arena is copied into a 1.5x larger allocation and every
   xdata[i] alias is re-patched.

   For better efficiency perhaps we should malloc separately each xdata[i],
   then if a remalloc is required we need only copy the data for that one row
   rather than all previous rows as it is now where a single large chunk of
   memory is used.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* first pass: total request size (total_sz) and total IS count (ct) */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* initial arena size: heuristic based on average nonzeros per local row;
     grown on demand below */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct1 = read cursor in the request, ct2 = write cursor in the reply,
     ct3 = total words written into the arena so far */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;      /* remember where this IS's reply data starts */
      kmax = rbuf_i[2*j];
      /* copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow arena 1.5x, copy, and re-patch all xdata[] aliases */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each copied row by its column indices (one level of overlap) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* Amat: shift local columns to global */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* Bmat: translate via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1]; }
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;  /* next reply starts right after this one */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
     Every processor gets the entire matrix, assembled as a sequential
     matrix B returned through Bin (allocated here for MAT_INITIAL_MATRIX).
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* local rows: row length = diagonal-block + off-diagonal-block counts */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over; each row is emitted in
       ascending global column order by interleaving the off-diagonal block
       (below-diagonal part), the diagonal block, then the rest of the
       off-diagonal block.
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];  /* write directly into B's j array */
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
     Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES) {
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];  /* write directly into B's value array */
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same lower/diagonal/upper interleaving as the column-index pass above */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* reuse path: counts were not built above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  /* propagate symmetry information to the sequential copy */
  if (A->symmetric) {
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices
   (isrow[i] x iscol[i]) from the parallel matrix C, staging the work so at
   most nmax submatrices are extracted per pass.  Column index sets must be
   sorted.  The special case where one identity-indexed square submatrix
   covers the whole matrix is dispatched to MatGetSubMatrix_MPIAIJ_All.
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
  /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
     However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
     take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
  */
  for (i = 0; i < ismax; ++i) {
    PetscBool sorted;
    ierr = ISSorted(iscol[i], &sorted);CHKERRQ(ierr);
    if (!sorted) SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_SUP, "Column index set %D not sorted", i);
  }

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      /* user can veto the fast path via -use_fast_submatrix false */
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* collective decision: MPI_MIN means every rank must agree on the fast path */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPIU_BOOL,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N) {
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done:
     cap per-stage scratch at roughly 20M bytes of column indices */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages (the per-stage call
     below is collective, so ranks with fewer sets still participate with
     max_no = 0) */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A  = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status
*r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 831 MPI_Status *r_status3,*r_status4,*s_status4; 832 MPI_Comm comm; 833 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 834 PetscMPIInt *onodes1,*olengths1; 835 PetscMPIInt idex,idex2,end; 836 837 PetscFunctionBegin; 838 comm = ((PetscObject)C)->comm; 839 tag0 = ((PetscObject)C)->tag; 840 size = c->size; 841 rank = c->rank; 842 843 /* Get some new tags to keep the communication clean */ 844 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 845 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 846 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 847 848 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 849 850 for (i=0; i<ismax; i++) { 851 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 852 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 853 if (allcolumns[i]) { 854 icol[i] = PETSC_NULL; 855 ncol[i] = C->cmap->N; 856 } else { 857 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 858 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 859 } 860 } 861 862 /* evaluate communication - mesg to who, length of mesg, and buffer space 863 required. 
Based on this, buffers are allocated, and data copied into them*/ 864 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 865 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 866 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 867 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 868 for (i=0; i<ismax; i++) { 869 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 870 jmax = nrow[i]; 871 irow_i = irow[i]; 872 for (j=0; j<jmax; j++) { 873 l = 0; 874 row = irow_i[j]; 875 while (row >= C->rmap->range[l+1]) l++; 876 proc = l; 877 w4[proc]++; 878 } 879 for (j=0; j<size; j++) { 880 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 881 } 882 } 883 884 nrqs = 0; /* no of outgoing messages */ 885 msz = 0; /* total mesg length (for all procs) */ 886 w1[rank] = 0; /* no mesg sent to self */ 887 w3[rank] = 0; 888 for (i=0; i<size; i++) { 889 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 890 } 891 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 892 for (i=0,j=0; i<size; i++) { 893 if (w1[i]) { pa[j] = i; j++; } 894 } 895 896 /* Each message would have a header = 1 + 2*(no of IS) + data */ 897 for (i=0; i<nrqs; i++) { 898 j = pa[i]; 899 w1[j] += w2[j] + 2* w3[j]; 900 msz += w1[j]; 901 } 902 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 903 904 /* Determine the number of messages to expect, their lengths, from from-ids */ 905 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 906 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 907 908 /* Now post the Irecvs corresponding to these messages */ 909 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 910 911 ierr = PetscFree(onodes1);CHKERRQ(ierr); 912 ierr = PetscFree(olengths1);CHKERRQ(ierr); 913 914 /* Allocate Memory for outgoing messages */ 915 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 916 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 917 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 918 919 { 920 PetscInt *iptr = tmp,ict = 0; 921 for (i=0; i<nrqs; i++) { 922 j = pa[i]; 923 iptr += ict; 924 sbuf1[j] = iptr; 925 ict = w1[j]; 926 } 927 } 928 929 /* Form the outgoing messages */ 930 /* Initialize the header space */ 931 for (i=0; i<nrqs; i++) { 932 j = pa[i]; 933 sbuf1[j][0] = 0; 934 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 935 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 936 } 937 938 /* Parse the isrow and copy data into outbuf */ 939 for (i=0; i<ismax; i++) { 940 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 941 irow_i = irow[i]; 942 jmax = nrow[i]; 943 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 944 l = 0; 945 row = irow_i[j]; 946 while (row >= C->rmap->range[l+1]) l++; 947 proc = l; 948 if (proc != rank) { /* copy to the outgoing buf*/ 949 ctr[proc]++; 950 *ptr[proc] = row; 951 ptr[proc]++; 952 } 953 } 954 /* Update the headers for the current IS */ 955 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 956 if ((ctr_j = ctr[j])) { 957 sbuf1_j = sbuf1[j]; 958 k = ++sbuf1_j[0]; 959 sbuf1_j[2*k] = ctr_j; 960 sbuf1_j[2*k-1] = i; 961 } 962 } 963 } 964 965 /* Now post the sends */ 966 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 967 for (i=0; i<nrqs; ++i) { 968 j = pa[i]; 969 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 970 } 971 972 /* Post Receives to capture the buffer size */ 973 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 974 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 975 rbuf2[0] = tmp + msz; 976 for (i=1; i<nrqs; ++i) { 977 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 978 } 979 for (i=0; i<nrqs; ++i) { 980 j = pa[i]; 981 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 982 } 983 984 /* Send to other procs the buf size they should allocate */ 985 986 987 /* Receive messages*/ 988 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 989 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 990 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 991 { 992 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 993 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 994 PetscInt *sbuf2_i; 995 996 for (i=0; i<nrqr; ++i) { 997 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 998 req_size[idex] = 0; 999 rbuf1_i = rbuf1[idex]; 1000 start = 2*rbuf1_i[0] + 1; 1001 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1002 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 1003 sbuf2_i = sbuf2[idex]; 1004 for (j=start; j<end; j++) { 1005 id = rbuf1_i[j] - rstart; 1006 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1007 sbuf2_i[j] = ncols; 1008 req_size[idex] += ncols; 1009 } 1010 req_source[idex] = r_status1[i].MPI_SOURCE; 1011 /* form the header */ 1012 sbuf2_i[0] = req_size[idex]; 1013 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1014 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1015 } 1016 } 1017 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1018 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1019 1020 /* recv buffer sizes */ 1021 /* Receive messages*/ 1022 1023 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1024 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1025 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1026 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1027 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1028 1029 for (i=0; i<nrqs; ++i) { 1030 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1031 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1032 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1033 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1034 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1035 } 1036 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1037 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1038 1039 /* Wait on sends1 and sends2 */ 1040 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1041 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1042 1043 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1044 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1045 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1046 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1047 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1048 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1049 1050 /* Now allocate buffers for a->j, and send them off */ 1051 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1052 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1053 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1054 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1055 1056 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1057 { 1058 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1059 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1060 PetscInt cend = C->cmap->rend; 1061 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1062 1063 for (i=0; i<nrqr; i++) { 1064 rbuf1_i = rbuf1[i]; 1065 sbuf_aj_i = sbuf_aj[i]; 1066 ct1 = 2*rbuf1_i[0] + 1; 1067 ct2 = 0; 1068 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1069 kmax = rbuf1[i][2*j]; 1070 for (k=0; k<kmax; k++,ct1++) { 1071 row = rbuf1_i[ct1] - rstart; 1072 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1073 ncols = nzA + nzB; 1074 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1075 1076 /* load the column indices for this row into cols*/ 1077 cols = sbuf_aj_i + ct2; 1078 1079 lwrite = 0; 1080 for (l=0; l<nzB; l++) { 1081 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1082 } 1083 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1084 for (l=0; l<nzB; l++) { 1085 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1086 } 1087 1088 ct2 += ncols; 1089 } 1090 } 1091 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1092 } 1093 } 1094 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1095 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1096 1097 /* Allocate buffers for a->a, and send them off */ 1098 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1099 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1100 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1101 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1102 1103 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1104 { 1105 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1106 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1107 PetscInt cend = C->cmap->rend; 1108 PetscInt *b_j = b->j; 1109 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1110 1111 for (i=0; 
i<nrqr; i++) { 1112 rbuf1_i = rbuf1[i]; 1113 sbuf_aa_i = sbuf_aa[i]; 1114 ct1 = 2*rbuf1_i[0]+1; 1115 ct2 = 0; 1116 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1117 kmax = rbuf1_i[2*j]; 1118 for (k=0; k<kmax; k++,ct1++) { 1119 row = rbuf1_i[ct1] - rstart; 1120 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1121 ncols = nzA + nzB; 1122 cworkB = b_j + b_i[row]; 1123 vworkA = a_a + a_i[row]; 1124 vworkB = b_a + b_i[row]; 1125 1126 /* load the column values for this row into vals*/ 1127 vals = sbuf_aa_i+ct2; 1128 1129 lwrite = 0; 1130 for (l=0; l<nzB; l++) { 1131 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1132 } 1133 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1134 for (l=0; l<nzB; l++) { 1135 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1136 } 1137 1138 ct2 += ncols; 1139 } 1140 } 1141 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1142 } 1143 } 1144 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1145 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1146 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1147 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1148 1149 /* Form the matrix */ 1150 /* create col map: global col of C -> local col of submatrices */ 1151 { 1152 const PetscInt *icol_i; 1153 #if defined (PETSC_USE_CTABLE) 1154 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1155 for (i=0; i<ismax; i++) { 1156 if (!allcolumns[i]) { 1157 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1158 jmax = ncol[i]; 1159 icol_i = icol[i]; 1160 cmap_i = cmap[i]; 1161 for (j=0; j<jmax; j++) { 1162 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1163 } 1164 } else { 1165 cmap[i] = PETSC_NULL; 1166 } 1167 } 1168 #else 1169 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1170 for (i=0; i<ismax; i++) { 1171 if (!allcolumns[i]) { 1172 ierr = 
PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1173 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1174 jmax = ncol[i]; 1175 icol_i = icol[i]; 1176 cmap_i = cmap[i]; 1177 for (j=0; j<jmax; j++) { 1178 cmap_i[icol_i[j]] = j+1; 1179 } 1180 } else { 1181 cmap[i] = PETSC_NULL; 1182 } 1183 } 1184 #endif 1185 } 1186 1187 /* Create lens which is required for MatCreate... */ 1188 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1189 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1190 if (ismax) { 1191 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1192 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1193 } 1194 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1195 1196 /* Update lens from local data */ 1197 for (i=0; i<ismax; i++) { 1198 jmax = nrow[i]; 1199 if (!allcolumns[i]) cmap_i = cmap[i]; 1200 irow_i = irow[i]; 1201 lens_i = lens[i]; 1202 for (j=0; j<jmax; j++) { 1203 l = 0; 1204 row = irow_i[j]; 1205 while (row >= C->rmap->range[l+1]) l++; 1206 proc = l; 1207 if (proc == rank) { 1208 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1209 if (!allcolumns[i]) { 1210 for (k=0; k<ncols; k++) { 1211 #if defined (PETSC_USE_CTABLE) 1212 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1213 #else 1214 tcol = cmap_i[cols[k]]; 1215 #endif 1216 if (tcol) { lens_i[j]++;} 1217 } 1218 } else { /* allcolumns */ 1219 lens_i[j] = ncols; 1220 } 1221 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1222 } 1223 } 1224 } 1225 1226 /* Create row map: global row of C -> local row of submatrices */ 1227 #if defined (PETSC_USE_CTABLE) 1228 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1229 for (i=0; i<ismax; i++) { 1230 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1231 rmap_i = rmap[i]; 1232 irow_i = irow[i]; 1233 jmax = nrow[i]; 1234 for (j=0; j<jmax; j++) { 1235 ierr = 
PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1236 } 1237 } 1238 #else 1239 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1240 if (ismax) { 1241 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1242 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1243 } 1244 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1245 for (i=0; i<ismax; i++) { 1246 rmap_i = rmap[i]; 1247 irow_i = irow[i]; 1248 jmax = nrow[i]; 1249 for (j=0; j<jmax; j++) { 1250 rmap_i[irow_i[j]] = j; 1251 } 1252 } 1253 #endif 1254 1255 /* Update lens from offproc data */ 1256 { 1257 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1258 1259 for (tmp2=0; tmp2<nrqs; tmp2++) { 1260 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1261 idex = pa[idex2]; 1262 sbuf1_i = sbuf1[idex]; 1263 jmax = sbuf1_i[0]; 1264 ct1 = 2*jmax+1; 1265 ct2 = 0; 1266 rbuf2_i = rbuf2[idex2]; 1267 rbuf3_i = rbuf3[idex2]; 1268 for (j=1; j<=jmax; j++) { 1269 is_no = sbuf1_i[2*j-1]; 1270 max1 = sbuf1_i[2*j]; 1271 lens_i = lens[is_no]; 1272 if (!allcolumns[is_no]) {cmap_i = cmap[is_no];} 1273 rmap_i = rmap[is_no]; 1274 for (k=0; k<max1; k++,ct1++) { 1275 #if defined (PETSC_USE_CTABLE) 1276 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1277 row--; 1278 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1279 #else 1280 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1281 #endif 1282 max2 = rbuf2_i[ct1]; 1283 for (l=0; l<max2; l++,ct2++) { 1284 if (!allcolumns[is_no]) { 1285 #if defined (PETSC_USE_CTABLE) 1286 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1287 #else 1288 tcol = cmap_i[rbuf3_i[ct2]]; 1289 #endif 1290 if (tcol) { 1291 lens_i[row]++; 1292 } 1293 } else { /* allcolumns */ 1294 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1295 } 1296 } 1297 } 1298 } 1299 } 1300 } 1301 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1302 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1303 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1304 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1305 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1306 1307 /* Create the submatrices */ 1308 if (scall == MAT_REUSE_MATRIX) { 1309 PetscBool flag; 1310 1311 /* 1312 Assumes new rows are same length as the old rows,hence bug! 1313 */ 1314 for (i=0; i<ismax; i++) { 1315 mat = (Mat_SeqAIJ *)(submats[i]->data); 1316 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1317 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1318 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1319 /* Initial matrix as if empty */ 1320 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1321 submats[i]->factortype = C->factortype; 1322 } 1323 } else { 1324 for (i=0; i<ismax; i++) { 1325 PetscInt rbs,cbs; 1326 ierr = ISGetBlockSize(isrow[i],&rbs);CHKERRQ(ierr); 1327 ierr = ISGetBlockSize(iscol[i],&cbs);CHKERRQ(ierr); 1328 1329 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1330 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1331 1332 ierr = MatSetBlockSizes(submats[i],rbs,cbs);CHKERRQ(ierr); 1333 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1334 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1335 } 1336 } 1337 1338 /* Assemble the matrices */ 1339 /* First assemble the local rows */ 1340 { 1341 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1342 PetscScalar *imat_a; 1343 1344 for (i=0; i<ismax; i++) { 1345 mat = (Mat_SeqAIJ*)submats[i]->data; 1346 imat_ilen = mat->ilen; 1347 imat_j = mat->j; 
1348 imat_i = mat->i; 1349 imat_a = mat->a; 1350 if (!allcolumns[i]) cmap_i = cmap[i]; 1351 rmap_i = rmap[i]; 1352 irow_i = irow[i]; 1353 jmax = nrow[i]; 1354 for (j=0; j<jmax; j++) { 1355 l = 0; 1356 row = irow_i[j]; 1357 while (row >= C->rmap->range[l+1]) l++; 1358 proc = l; 1359 if (proc == rank) { 1360 old_row = row; 1361 #if defined (PETSC_USE_CTABLE) 1362 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1363 row--; 1364 #else 1365 row = rmap_i[row]; 1366 #endif 1367 ilen_row = imat_ilen[row]; 1368 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1369 mat_i = imat_i[row] ; 1370 mat_a = imat_a + mat_i; 1371 mat_j = imat_j + mat_i; 1372 if (!allcolumns[i]) { 1373 for (k=0; k<ncols; k++) { 1374 #if defined (PETSC_USE_CTABLE) 1375 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1376 #else 1377 tcol = cmap_i[cols[k]]; 1378 #endif 1379 if (tcol) { 1380 *mat_j++ = tcol - 1; 1381 *mat_a++ = vals[k]; 1382 ilen_row++; 1383 } 1384 } 1385 } else { /* allcolumns */ 1386 for (k=0; k<ncols; k++) { 1387 *mat_j++ = cols[k] ; /* global col index! 
*/ 1388 *mat_a++ = vals[k]; 1389 ilen_row++; 1390 } 1391 } 1392 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1393 imat_ilen[row] = ilen_row; 1394 } 1395 } 1396 } 1397 } 1398 1399 /* Now assemble the off proc rows*/ 1400 { 1401 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1402 PetscInt *imat_j,*imat_i; 1403 PetscScalar *imat_a,*rbuf4_i; 1404 1405 for (tmp2=0; tmp2<nrqs; tmp2++) { 1406 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1407 idex = pa[idex2]; 1408 sbuf1_i = sbuf1[idex]; 1409 jmax = sbuf1_i[0]; 1410 ct1 = 2*jmax + 1; 1411 ct2 = 0; 1412 rbuf2_i = rbuf2[idex2]; 1413 rbuf3_i = rbuf3[idex2]; 1414 rbuf4_i = rbuf4[idex2]; 1415 for (j=1; j<=jmax; j++) { 1416 is_no = sbuf1_i[2*j-1]; 1417 rmap_i = rmap[is_no]; 1418 if (!allcolumns[is_no]) {cmap_i = cmap[is_no];} 1419 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1420 imat_ilen = mat->ilen; 1421 imat_j = mat->j; 1422 imat_i = mat->i; 1423 imat_a = mat->a; 1424 max1 = sbuf1_i[2*j]; 1425 for (k=0; k<max1; k++,ct1++) { 1426 row = sbuf1_i[ct1]; 1427 #if defined (PETSC_USE_CTABLE) 1428 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1429 row--; 1430 #else 1431 row = rmap_i[row]; 1432 #endif 1433 ilen = imat_ilen[row]; 1434 mat_i = imat_i[row] ; 1435 mat_a = imat_a + mat_i; 1436 mat_j = imat_j + mat_i; 1437 max2 = rbuf2_i[ct1]; 1438 if (!allcolumns[is_no]) { 1439 for (l=0; l<max2; l++,ct2++) { 1440 1441 #if defined (PETSC_USE_CTABLE) 1442 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1443 #else 1444 tcol = cmap_i[rbuf3_i[ct2]]; 1445 #endif 1446 if (tcol) { 1447 *mat_j++ = tcol - 1; 1448 *mat_a++ = rbuf4_i[ct2]; 1449 ilen++; 1450 } 1451 } 1452 } else { /* allcolumns */ 1453 for (l=0; l<max2; l++,ct2++) { 1454 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1455 *mat_a++ = rbuf4_i[ct2]; 1456 ilen++; 1457 } 1458 } 1459 imat_ilen[row] = ilen; 1460 } 1461 } 1462 } 1463 } 1464 1465 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1466 ierr 
= PetscFree(r_waits4);CHKERRQ(ierr); 1467 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1468 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1469 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1470 1471 /* Restore the indices */ 1472 for (i=0; i<ismax; i++) { 1473 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1474 if (!allcolumns[i]) { 1475 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1476 } 1477 } 1478 1479 /* Destroy allocated memory */ 1480 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1481 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1482 ierr = PetscFree(pa);CHKERRQ(ierr); 1483 1484 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1485 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1486 for (i=0; i<nrqr; ++i) { 1487 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1488 } 1489 for (i=0; i<nrqs; ++i) { 1490 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1491 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1492 } 1493 1494 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1495 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1496 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1497 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1498 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1499 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1500 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1501 1502 #if defined (PETSC_USE_CTABLE) 1503 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1504 #else 1505 if (ismax) {ierr = PetscFree(rmap[0]);CHKERRQ(ierr);} 1506 #endif 1507 ierr = PetscFree(rmap);CHKERRQ(ierr); 1508 1509 for (i=0; i<ismax; i++) { 1510 if (!allcolumns[i]) { 1511 #if defined (PETSC_USE_CTABLE) 1512 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1513 #else 1514 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1515 #endif 1516 } 1517 } 1518 ierr = PetscFree(cmap);CHKERRQ(ierr); 1519 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);} 1520 ierr = PetscFree(lens);CHKERRQ(ierr); 1521 1522 for (i=0; i<ismax; i++) { 1523 ierr 
= MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
/*
   MatCreateMPIAIJFromSeqMatrices_Private - builds an MPIAIJ matrix C over comm directly
   from two sequential pieces: A becomes the diagonal block and B the off-diagonal
   block of the local part of C.

   Input:
     comm - communicator for the new parallel matrix
     A    - SeqAIJ matrix holding the diagonal portion
     B    - SeqAIJ matrix holding the off-diagonal portion (same local row count as A)

   Output:
     C    - the assembled MPIAIJ matrix; it stores plain pointers to A and B WITHOUT
            increasing their reference counts (see the note above)

   Errors use PETSC_ERR_PLIB because this routine is private; see the comment below
   about changing that if the function is ever made public.
*/
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij);CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* A and B must describe the same local rows and agree on block sizes. */
  if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE);CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* Global column count of C must equal diag columns plus off-diag columns. */
  if ((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ);CHKERRQ(ierr);
  aij = (Mat_MPIAIJ*)((*C)->data);
  /* Plug the sequential pieces straight into C; no reference counting (see note above). */
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A);CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B);CHKERRQ(ierr);
  /* C is only as ready as its least-ready component. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
/*
   MatMPIAIJExtractSeqMatrices_Private - returns borrowed pointers to the diagonal (A)
   and off-diagonal (B) sequential blocks of the MPIAIJ matrix C. The references are
   NOT increfed, so the caller must not destroy them independently.
*/
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);

  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
/*
   MatGetSubMatricesParallel_MPIXAIJ - extracts submatrices of C for the given row/column
   index sets; index sets living on a parallel communicator yield parallel submatrices,
   sequential index sets yield sequential submatrices.

   The format-specific operations are supplied as callbacks so this driver can be shared
   across matrix types:
     getsubmats_seq - extracts sequential submatrices (e.g. MatGetSubMatrices_MPIAIJ)
     makefromseq    - builds a parallel matrix from a diag/off-diag pair of Seq matrices
     extractseq     - returns the diag/off-diag Seq blocks of an existing parallel matrix
                      (used in the MAT_REUSE_MATRIX path)

   Each isrow[i] must share an identical communicator with the matching iscol[i].
*/
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;  /* number of parallel (comm size > 1) index sets */

  PetscFunctionBegin;
  if (!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate the IS communicators and count the parallel index sets. */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag);CHKERRQ(ierr);
    if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
    if (size > 1) {
      ++ismax_c;
    }
  }
  if (!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine.
*/
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);CHKERRQ(ierr);
  } else { /* if (ismax_c) */
    Mat *A,*B;
    IS  *isrow_c, *iscol_c;
    PetscMPIInt size;  /* NOTE(review): shadows the outer 'size'; harmless here but confusing */
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
      array of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which getsubmats_seq
      will deposit the extracted diag and off-diag parts.
      Thus, if reuse is taking place, we have to allocate the seq matrix arrays here.
      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A);CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B);CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /* Parallel submatrix: borrow its diag (A[i]) and off-diag (B[ii]) Seq blocks. */
          ierr = (*extractseq)((*submat)[i],A+i,B+ii);CHKERRQ(ierr);
          ++ii;
        } else {
          /* Sequential submatrix: it is itself the "diag" matrix to be refilled. */
          A[i] = (*submat)[i];
        }
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c);CHKERRQ(ierr);
    for (i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
      if (size > 1) {
        isrow_c[ii] = isrow[i];  /* alias, not a copy: do not destroy isrow_c[ii] */
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii]));CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A);CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B);CHKERRQ(ierr);
    for (ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]);CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c);CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if (scall != MAT_REUSE_MATRIX) {
      for (i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);CHKERRQ(ierr);
        if (size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
            Construct submat[i] from the Seq pieces A and B.
          */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i);CHKERRQ(ierr);
          ++ii;
        } else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Only the placeholder arrays are freed; the matrices themselves now live in submat. */
    ierr = PetscFree(A);CHKERRQ(ierr);
    ierr = PetscFree(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
/*
   MatGetSubMatricesParallel_MPIAIJ - AIJ instantiation of the generic parallel-submatrix
   driver, supplying the AIJ-specific extraction/assembly callbacks.
*/
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}