/*
  Routines to compute overlapping regions of a parallel MPI matrix
  and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - increases the overlap of the index sets is[]
   by ov levels; each level is performed by one collective call to
   MatIncreaseOverlap_MPIAIJ_Once() (the is[] are destroyed and recreated
   there with the enlarged index sets).

   Input:
     C    - the MPIAIJ matrix
     imax - number of index sets
     is   - the index sets (overwritten with the enlarged sets)
     ov   - number of levels of overlap requested; must be >= 0
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1   => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - performs ONE level of overlap increase:
   each index set is enlarged by every column coupled to its rows.  Rows owned
   locally are handled by MatIncreaseOverlap_MPIAIJ_Local(); rows owned by other
   processors are shipped there (messages in the format documented above),
   processed by MatIncreaseOverlap_MPIAIJ_Receive(), and the answers are merged
   back here.  The input IS objects are destroyed and recreated with the
   enlarged sets.  Collective on C.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  /* two fresh tags: tag1 for the request messages, tag2 for the replies */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required.  Based on this, buffers are allocated, and data copied into them.
     w1[p] = number of indices destined for proc p; w3[p] = number of index
     sets with at least one index for proc p; w2[p] = 1 iff a message goes
     to p at all. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* per-IS counts for this pass */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; all outgoing data lives in the
     single buffer tmp, with outdat[p] pointing at proc p's slice */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers; ptr[j] tracks the current write position in proc j's
     data section (past the header) */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table and one index array of
     capacity M per index set, carved out of two big allocations */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work while the messages are in flight */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the requests received from other processors */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge it into the local
     tables (bit table prevents duplicate entries) */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Recreate the index sets with the enlarged index lists */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax   - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
344 345 Output: 346 isz - array containing the count of the solution elements corresponding 347 to each index set; 348 data - pointer to the solutions 349 */ 350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 351 { 352 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 353 Mat A = c->A,B = c->B; 354 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 355 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 356 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 357 PetscBT table_i; 358 359 PetscFunctionBegin; 360 rstart = C->rmap->rstart; 361 cstart = C->cmap->rstart; 362 ai = a->i; 363 aj = a->j; 364 bi = b->i; 365 bj = b->j; 366 garray = c->garray; 367 368 369 for (i=0; i<imax; i++) { 370 data_i = data[i]; 371 table_i = table[i]; 372 isz_i = isz[i]; 373 for (j=0,max=isz[i]; j<max; j++) { 374 row = data_i[j] - rstart; 375 start = ai[row]; 376 end = ai[row+1]; 377 for (k=start; k<end; k++) { /* Amat */ 378 val = aj[k] + cstart; 379 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 380 } 381 start = bi[row]; 382 end = bi[row+1]; 383 for (k=start; k<end; k++) { /* Bmat */ 384 val = garray[bj[k]]; 385 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 386 } 387 } 388 isz[i] = isz_i; 389 } 390 PetscFunctionReturn(0); 391 } 392 393 #undef __FUNCT__ 394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 395 /* 396 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 397 and return the output 398 399 Input: 400 C - the matrix 401 nrqr - no of messages being processed. 
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
  then if a remalloc is required we need only copy the data for that one row
  rather than all previous rows as it is now where a single large chunk of
  memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* total_sz = total number of requested rows; ct = total number of IS headers */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i = rbuf[i];
    rbuf_0 = rbuf_i[0];
    ct    += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* initial estimate of the reply buffer size; grown by 1.5x below if exceeded */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,&xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  /* ct1 = read cursor in the request, ct2 = write cursor in the reply,
     ct3 = total entries written to xdata[0] so far (for overflow checks) */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3   += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated via xtable) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow xdata[0] and re-point the per-message slices xdata[1..i] */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then append every column coupled to those rows (one overlap level) */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal block: shift to global numbering */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal block: map through garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header */
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - builds on every processor a sequential copy B
   of the whole parallel matrix A (structure via MPI_Allgatherv of the column
   indices, and optionally the numerical values when flag == MAT_GET_VALUES).
   *Bin is allocated here for MAT_INITIAL_MATRIX and reused for MAT_REUSE_MATRIX.
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr  = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr  = MatSetBlockSizes(B,A->rmap->bs,A->cmap->bs);CHKERRQ(ierr);
    ierr  = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b     = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, interleaving the
       off-diagonal (bd) columns below/above the diagonal (ad) columns so
       each row comes out in ascending global column order
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  }  /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - extracts ismax sequential submatrices of C given
   by (isrow[i],iscol[i]).  Detects the "every processor wants the whole
   matrix" special case and dispatches to MatGetSubMatrix_MPIAIJ_All();
   otherwise processes the index sets in stages (at most nmax per stage,
   bounded by a ~20MB working-memory heuristic) via
   MatGetSubMatrices_MPIAIJ_Local().
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;

  /* Currently, unsorted column indices will result in inverted column indices in the resulting submatrices. */
  /* It would make sense to error out in MatGetSubMatrices_MPIAIJ_Local(), the most impl-specific level.
     However, there are more careful users of MatGetSubMatrices_MPIAIJ_Local() -- MatPermute_MPIAIJ() -- that
     take care to order the result correctly by assembling it with MatSetValues() (after preallocating).
  */
  for(i = 0; i < ismax; ++i) {
    PetscBool sorted;
    ierr = ISSorted(iscol[i], &sorted); CHKERRQ(ierr);
    if(!sorted) SETERRQ1(((PetscObject)iscol[i])->comm, PETSC_ERR_SUP, "Column index set %D not sorted", i);
  }

  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* all processors must agree before taking the fast path */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N){
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done */
  nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt       **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2;
  PetscInt       **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
#if defined (PETSC_USE_CTABLE)
  PetscTable     *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i;
#else
  PetscInt       **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i;
#endif
  const PetscInt *irow_i;
  PetscInt       ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request    *r_waits4,*s_waits3,*s_waits4;
  MPI_Status
*r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 832 MPI_Status *r_status3,*r_status4,*s_status4; 833 MPI_Comm comm; 834 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 835 PetscMPIInt *onodes1,*olengths1; 836 PetscMPIInt idex,idex2,end; 837 838 PetscFunctionBegin; 839 840 comm = ((PetscObject)C)->comm; 841 tag0 = ((PetscObject)C)->tag; 842 size = c->size; 843 rank = c->rank; 844 845 /* Get some new tags to keep the communication clean */ 846 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 847 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 848 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 849 850 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 851 852 for (i=0; i<ismax; i++) { 853 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 854 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 855 if (allcolumns[i]){ 856 icol[i] = PETSC_NULL; 857 ncol[i] = C->cmap->N; 858 } else { 859 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 860 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 861 } 862 } 863 864 /* evaluate communication - mesg to who, length of mesg, and buffer space 865 required. 
Based on this, buffers are allocated, and data copied into them*/ 866 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 867 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 868 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 869 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 870 for (i=0; i<ismax; i++) { 871 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 872 jmax = nrow[i]; 873 irow_i = irow[i]; 874 for (j=0; j<jmax; j++) { 875 l = 0; 876 row = irow_i[j]; 877 while (row >= C->rmap->range[l+1]) l++; 878 proc = l; 879 w4[proc]++; 880 } 881 for (j=0; j<size; j++) { 882 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 883 } 884 } 885 886 nrqs = 0; /* no of outgoing messages */ 887 msz = 0; /* total mesg length (for all procs) */ 888 w1[rank] = 0; /* no mesg sent to self */ 889 w3[rank] = 0; 890 for (i=0; i<size; i++) { 891 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 892 } 893 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 894 for (i=0,j=0; i<size; i++) { 895 if (w1[i]) { pa[j] = i; j++; } 896 } 897 898 /* Each message would have a header = 1 + 2*(no of IS) + data */ 899 for (i=0; i<nrqs; i++) { 900 j = pa[i]; 901 w1[j] += w2[j] + 2* w3[j]; 902 msz += w1[j]; 903 } 904 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 905 906 /* Determine the number of messages to expect, their lengths, from from-ids */ 907 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 908 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 909 910 /* Now post the Irecvs corresponding to these messages */ 911 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 912 913 ierr = PetscFree(onodes1);CHKERRQ(ierr); 914 ierr = PetscFree(olengths1);CHKERRQ(ierr); 915 916 /* Allocate Memory for outgoing messages */ 917 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 918 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 919 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 920 921 { 922 PetscInt *iptr = tmp,ict = 0; 923 for (i=0; i<nrqs; i++) { 924 j = pa[i]; 925 iptr += ict; 926 sbuf1[j] = iptr; 927 ict = w1[j]; 928 } 929 } 930 931 /* Form the outgoing messages */ 932 /* Initialize the header space */ 933 for (i=0; i<nrqs; i++) { 934 j = pa[i]; 935 sbuf1[j][0] = 0; 936 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 937 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 938 } 939 940 /* Parse the isrow and copy data into outbuf */ 941 for (i=0; i<ismax; i++) { 942 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 943 irow_i = irow[i]; 944 jmax = nrow[i]; 945 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 946 l = 0; 947 row = irow_i[j]; 948 while (row >= C->rmap->range[l+1]) l++; 949 proc = l; 950 if (proc != rank) { /* copy to the outgoing buf*/ 951 ctr[proc]++; 952 *ptr[proc] = row; 953 ptr[proc]++; 954 } 955 } 956 /* Update the headers for the current IS */ 957 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 958 if ((ctr_j = ctr[j])) { 959 sbuf1_j = sbuf1[j]; 960 k = ++sbuf1_j[0]; 961 sbuf1_j[2*k] = ctr_j; 962 sbuf1_j[2*k-1] = i; 963 } 964 } 965 } 966 967 /* Now post the sends */ 968 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 969 for (i=0; i<nrqs; ++i) { 970 j = pa[i]; 971 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 972 } 973 974 /* Post Receives to capture the buffer size */ 975 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 976 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 977 rbuf2[0] = tmp + msz; 978 for (i=1; i<nrqs; ++i) { 979 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 980 } 981 for (i=0; i<nrqs; ++i) { 982 j = pa[i]; 983 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 984 } 985 986 /* Send to other procs the buf size they should allocate */ 987 988 989 /* Receive messages*/ 990 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 991 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 992 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 993 { 994 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 995 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 996 PetscInt *sbuf2_i; 997 998 for (i=0; i<nrqr; ++i) { 999 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 1000 req_size[idex] = 0; 1001 rbuf1_i = rbuf1[idex]; 1002 start = 2*rbuf1_i[0] + 1; 1003 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 1004 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 1005 sbuf2_i = sbuf2[idex]; 1006 for (j=start; j<end; j++) { 1007 id = rbuf1_i[j] - rstart; 1008 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 1009 sbuf2_i[j] = ncols; 1010 req_size[idex] += ncols; 1011 } 1012 req_source[idex] = r_status1[i].MPI_SOURCE; 1013 /* form the header */ 1014 sbuf2_i[0] = req_size[idex]; 1015 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1016 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1017 } 1018 } 1019 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1020 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1021 1022 /* recv buffer sizes */ 1023 /* Receive messages*/ 1024 1025 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1026 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1027 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1028 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1029 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1030 1031 for (i=0; i<nrqs; ++i) { 1032 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1033 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1034 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1035 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1036 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1037 } 1038 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1039 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1040 1041 /* Wait on sends1 and sends2 */ 1042 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1043 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1044 1045 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1046 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1047 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1048 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1049 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1050 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1051 1052 /* Now allocate buffers for a->j, and send them off */ 1053 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1054 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1055 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1056 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1057 1058 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1059 { 1060 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1061 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1062 PetscInt cend = C->cmap->rend; 1063 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1064 1065 for (i=0; i<nrqr; i++) { 1066 rbuf1_i = rbuf1[i]; 1067 sbuf_aj_i = sbuf_aj[i]; 1068 ct1 = 2*rbuf1_i[0] + 1; 1069 ct2 = 0; 1070 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1071 kmax = rbuf1[i][2*j]; 1072 for (k=0; k<kmax; k++,ct1++) { 1073 row = rbuf1_i[ct1] - rstart; 1074 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1075 ncols = nzA + nzB; 1076 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1077 1078 /* load the column indices for this row into cols*/ 1079 cols = sbuf_aj_i + ct2; 1080 1081 lwrite = 0; 1082 for (l=0; l<nzB; l++) { 1083 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1084 } 1085 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1086 for (l=0; l<nzB; l++) { 1087 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1088 } 1089 1090 ct2 += ncols; 1091 } 1092 } 1093 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1094 } 1095 } 1096 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1097 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1098 1099 /* Allocate buffers for a->a, and send them off */ 1100 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1101 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1102 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1103 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1104 1105 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1106 { 1107 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1108 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1109 PetscInt cend = C->cmap->rend; 1110 PetscInt *b_j = b->j; 1111 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1112 1113 for (i=0; 
i<nrqr; i++) { 1114 rbuf1_i = rbuf1[i]; 1115 sbuf_aa_i = sbuf_aa[i]; 1116 ct1 = 2*rbuf1_i[0]+1; 1117 ct2 = 0; 1118 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1119 kmax = rbuf1_i[2*j]; 1120 for (k=0; k<kmax; k++,ct1++) { 1121 row = rbuf1_i[ct1] - rstart; 1122 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1123 ncols = nzA + nzB; 1124 cworkB = b_j + b_i[row]; 1125 vworkA = a_a + a_i[row]; 1126 vworkB = b_a + b_i[row]; 1127 1128 /* load the column values for this row into vals*/ 1129 vals = sbuf_aa_i+ct2; 1130 1131 lwrite = 0; 1132 for (l=0; l<nzB; l++) { 1133 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1134 } 1135 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1136 for (l=0; l<nzB; l++) { 1137 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1138 } 1139 1140 ct2 += ncols; 1141 } 1142 } 1143 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1144 } 1145 } 1146 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1147 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1148 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1149 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1150 1151 /* Form the matrix */ 1152 /* create col map: global col of C -> local col of submatrices */ 1153 { 1154 const PetscInt *icol_i; 1155 #if defined (PETSC_USE_CTABLE) 1156 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1157 for (i=0; i<ismax; i++) { 1158 if (!allcolumns[i]){ 1159 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1160 jmax = ncol[i]; 1161 icol_i = icol[i]; 1162 cmap_i = cmap[i]; 1163 for (j=0; j<jmax; j++) { 1164 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1165 } 1166 } else { 1167 cmap[i] = PETSC_NULL; 1168 } 1169 } 1170 #else 1171 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1172 for (i=0; i<ismax; i++) { 1173 if (!allcolumns[i]){ 1174 ierr = 
PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1175 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1176 jmax = ncol[i]; 1177 icol_i = icol[i]; 1178 cmap_i = cmap[i]; 1179 for (j=0; j<jmax; j++) { 1180 cmap_i[icol_i[j]] = j+1; 1181 } 1182 } else { 1183 cmap[i] = PETSC_NULL; 1184 } 1185 } 1186 #endif 1187 } 1188 1189 /* Create lens which is required for MatCreate... */ 1190 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1191 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1192 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1193 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1194 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1195 1196 /* Update lens from local data */ 1197 for (i=0; i<ismax; i++) { 1198 jmax = nrow[i]; 1199 if (!allcolumns[i]) cmap_i = cmap[i]; 1200 irow_i = irow[i]; 1201 lens_i = lens[i]; 1202 for (j=0; j<jmax; j++) { 1203 l = 0; 1204 row = irow_i[j]; 1205 while (row >= C->rmap->range[l+1]) l++; 1206 proc = l; 1207 if (proc == rank) { 1208 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1209 if (!allcolumns[i]){ 1210 for (k=0; k<ncols; k++) { 1211 #if defined (PETSC_USE_CTABLE) 1212 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1213 #else 1214 tcol = cmap_i[cols[k]]; 1215 #endif 1216 if (tcol) { lens_i[j]++;} 1217 } 1218 } else { /* allcolumns */ 1219 lens_i[j] = ncols; 1220 } 1221 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1222 } 1223 } 1224 } 1225 1226 /* Create row map: global row of C -> local row of submatrices */ 1227 #if defined (PETSC_USE_CTABLE) 1228 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1229 for (i=0; i<ismax; i++) { 1230 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1231 rmap_i = rmap[i]; 1232 irow_i = irow[i]; 1233 jmax = nrow[i]; 1234 for (j=0; j<jmax; j++) { 1235 ierr = 
PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1236 } 1237 } 1238 #else 1239 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1240 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1241 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1242 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1243 for (i=0; i<ismax; i++) { 1244 rmap_i = rmap[i]; 1245 irow_i = irow[i]; 1246 jmax = nrow[i]; 1247 for (j=0; j<jmax; j++) { 1248 rmap_i[irow_i[j]] = j; 1249 } 1250 } 1251 #endif 1252 1253 /* Update lens from offproc data */ 1254 { 1255 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1256 1257 for (tmp2=0; tmp2<nrqs; tmp2++) { 1258 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1259 idex = pa[idex2]; 1260 sbuf1_i = sbuf1[idex]; 1261 jmax = sbuf1_i[0]; 1262 ct1 = 2*jmax+1; 1263 ct2 = 0; 1264 rbuf2_i = rbuf2[idex2]; 1265 rbuf3_i = rbuf3[idex2]; 1266 for (j=1; j<=jmax; j++) { 1267 is_no = sbuf1_i[2*j-1]; 1268 max1 = sbuf1_i[2*j]; 1269 lens_i = lens[is_no]; 1270 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1271 rmap_i = rmap[is_no]; 1272 for (k=0; k<max1; k++,ct1++) { 1273 #if defined (PETSC_USE_CTABLE) 1274 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1275 row--; 1276 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1277 #else 1278 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1279 #endif 1280 max2 = rbuf2_i[ct1]; 1281 for (l=0; l<max2; l++,ct2++) { 1282 if (!allcolumns[is_no]){ 1283 #if defined (PETSC_USE_CTABLE) 1284 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1285 #else 1286 tcol = cmap_i[rbuf3_i[ct2]]; 1287 #endif 1288 if (tcol) { 1289 lens_i[row]++; 1290 } 1291 } else { /* allcolumns */ 1292 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1293 } 1294 } 1295 } 1296 } 1297 } 1298 } 1299 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1300 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1301 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1302 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1303 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1304 1305 /* Create the submatrices */ 1306 if (scall == MAT_REUSE_MATRIX) { 1307 PetscBool flag; 1308 1309 /* 1310 Assumes new rows are same length as the old rows,hence bug! 1311 */ 1312 for (i=0; i<ismax; i++) { 1313 mat = (Mat_SeqAIJ *)(submats[i]->data); 1314 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1315 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1316 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1317 /* Initial matrix as if empty */ 1318 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1319 submats[i]->factortype = C->factortype; 1320 } 1321 } else { 1322 for (i=0; i<ismax; i++) { 1323 PetscInt rbs,cbs; 1324 ierr = ISGetBlockSize(isrow[i],&rbs); CHKERRQ(ierr); 1325 ierr = ISGetBlockSize(iscol[i],&cbs); CHKERRQ(ierr); 1326 1327 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1328 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1329 1330 ierr = MatSetBlockSizes(submats[i],rbs,cbs); CHKERRQ(ierr); 1331 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1332 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1333 } 1334 } 1335 1336 /* Assemble the matrices */ 1337 /* First assemble the local rows */ 1338 { 1339 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1340 PetscScalar *imat_a; 1341 1342 for (i=0; i<ismax; i++) { 1343 mat = (Mat_SeqAIJ*)submats[i]->data; 1344 imat_ilen = mat->ilen; 1345 imat_j = mat->j; 
1346 imat_i = mat->i; 1347 imat_a = mat->a; 1348 if (!allcolumns[i]) cmap_i = cmap[i]; 1349 rmap_i = rmap[i]; 1350 irow_i = irow[i]; 1351 jmax = nrow[i]; 1352 for (j=0; j<jmax; j++) { 1353 l = 0; 1354 row = irow_i[j]; 1355 while (row >= C->rmap->range[l+1]) l++; 1356 proc = l; 1357 if (proc == rank) { 1358 old_row = row; 1359 #if defined (PETSC_USE_CTABLE) 1360 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1361 row--; 1362 #else 1363 row = rmap_i[row]; 1364 #endif 1365 ilen_row = imat_ilen[row]; 1366 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1367 mat_i = imat_i[row] ; 1368 mat_a = imat_a + mat_i; 1369 mat_j = imat_j + mat_i; 1370 if (!allcolumns[i]){ 1371 for (k=0; k<ncols; k++) { 1372 #if defined (PETSC_USE_CTABLE) 1373 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1374 #else 1375 tcol = cmap_i[cols[k]]; 1376 #endif 1377 if (tcol){ 1378 *mat_j++ = tcol - 1; 1379 *mat_a++ = vals[k]; 1380 ilen_row++; 1381 } 1382 } 1383 } else { /* allcolumns */ 1384 for (k=0; k<ncols; k++) { 1385 *mat_j++ = cols[k] ; /* global col index! 
*/ 1386 *mat_a++ = vals[k]; 1387 ilen_row++; 1388 } 1389 } 1390 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1391 imat_ilen[row] = ilen_row; 1392 } 1393 } 1394 } 1395 } 1396 1397 /* Now assemble the off proc rows*/ 1398 { 1399 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1400 PetscInt *imat_j,*imat_i; 1401 PetscScalar *imat_a,*rbuf4_i; 1402 1403 for (tmp2=0; tmp2<nrqs; tmp2++) { 1404 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1405 idex = pa[idex2]; 1406 sbuf1_i = sbuf1[idex]; 1407 jmax = sbuf1_i[0]; 1408 ct1 = 2*jmax + 1; 1409 ct2 = 0; 1410 rbuf2_i = rbuf2[idex2]; 1411 rbuf3_i = rbuf3[idex2]; 1412 rbuf4_i = rbuf4[idex2]; 1413 for (j=1; j<=jmax; j++) { 1414 is_no = sbuf1_i[2*j-1]; 1415 rmap_i = rmap[is_no]; 1416 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1417 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1418 imat_ilen = mat->ilen; 1419 imat_j = mat->j; 1420 imat_i = mat->i; 1421 imat_a = mat->a; 1422 max1 = sbuf1_i[2*j]; 1423 for (k=0; k<max1; k++,ct1++) { 1424 row = sbuf1_i[ct1]; 1425 #if defined (PETSC_USE_CTABLE) 1426 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1427 row--; 1428 #else 1429 row = rmap_i[row]; 1430 #endif 1431 ilen = imat_ilen[row]; 1432 mat_i = imat_i[row] ; 1433 mat_a = imat_a + mat_i; 1434 mat_j = imat_j + mat_i; 1435 max2 = rbuf2_i[ct1]; 1436 if (!allcolumns[is_no]){ 1437 for (l=0; l<max2; l++,ct2++) { 1438 1439 #if defined (PETSC_USE_CTABLE) 1440 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1441 #else 1442 tcol = cmap_i[rbuf3_i[ct2]]; 1443 #endif 1444 if (tcol) { 1445 *mat_j++ = tcol - 1; 1446 *mat_a++ = rbuf4_i[ct2]; 1447 ilen++; 1448 } 1449 } 1450 } else { /* allcolumns */ 1451 for (l=0; l<max2; l++,ct2++) { 1452 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1453 *mat_a++ = rbuf4_i[ct2]; 1454 ilen++; 1455 } 1456 } 1457 imat_ilen[row] = ilen; 1458 } 1459 } 1460 } 1461 } 1462 1463 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1464 ierr = 
PetscFree(r_waits4);CHKERRQ(ierr); 1465 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1466 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1467 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1468 1469 /* Restore the indices */ 1470 for (i=0; i<ismax; i++) { 1471 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1472 if (!allcolumns[i]){ 1473 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1474 } 1475 } 1476 1477 /* Destroy allocated memory */ 1478 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1479 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1480 ierr = PetscFree(pa);CHKERRQ(ierr); 1481 1482 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1483 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1484 for (i=0; i<nrqr; ++i) { 1485 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1486 } 1487 for (i=0; i<nrqs; ++i) { 1488 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1489 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1490 } 1491 1492 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1493 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1494 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1495 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1496 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1497 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1498 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1499 1500 #if defined (PETSC_USE_CTABLE) 1501 for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);} 1502 #else 1503 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1504 #endif 1505 ierr = PetscFree(rmap);CHKERRQ(ierr); 1506 1507 for (i=0; i<ismax; i++) { 1508 if (!allcolumns[i]){ 1509 #if defined (PETSC_USE_CTABLE) 1510 ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr); 1511 #else 1512 ierr = PetscFree(cmap[i]);CHKERRQ(ierr); 1513 #endif 1514 } 1515 } 1516 ierr = PetscFree(cmap);CHKERRQ(ierr); 1517 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1518 ierr = PetscFree(lens);CHKERRQ(ierr); 1519 1520 for (i=0; i<ismax; i++) { 1521 ierr = 
MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
   Builds an MPIAIJ matrix on comm directly from its two sequential pieces:
   A becomes the diagonal block and B the off-diagonal block of *C.

   Observe that the Seq matrices used to construct this MPI matrix are not increfed.
   Be careful not to destroy them elsewhere.
*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscObjectTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  ierr = PetscObjectTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if (!seqaij) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  /* A and B hold different column blocks of the same local rows, so their local
     row counts and block sizes must agree. */
  if (A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");

  ierr = MatCreate(comm, C); CHKERRQ(ierr);
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
  ierr = MatSetBlockSizes(*C,A->rmap->bs, A->cmap->bs); CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The global column count implied by the layout must equal diag width + off-diag width. */
  if((*C)->cmap->N != A->cmap->n + B->cmap->n) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
  /* Plug the sequential pieces directly into C's internal structure (no copy, no incref). */
  aij = (Mat_MPIAIJ*)((*C)->data);
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

/*
   Hands back the sequential diagonal (*A) and off-diagonal (*B) blocks stored
   inside the MPIAIJ matrix C.
*/
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  /* Note that we don't incref *A and *B, so be careful! */
  PetscFunctionReturn(0);
}

/*
   Extracts the (possibly parallel) submatrices of C selected by isrow[i]/iscol[i].
   If every index set lives on a one-process communicator the work is delegated
   entirely to getsubmats_seq(); otherwise the diag and off-diag sequential pieces
   of each parallel submatrix are extracted separately and combined via makefromseq().

   Callback contract (as used below; see MatGetSubMatricesParallel_MPIAIJ for the
   concrete AIJ instantiations):
     getsubmats_seq - extracts an array of sequential submatrices; allocates the
                      array itself when scall != MAT_REUSE_MATRIX
     makefromseq    - builds one parallel matrix from a diag/off-diag seq pair
     extractseq     - retrieves the diag/off-diag seq pair of an existing parallel
                      matrix (used only on the MAT_REUSE_MATRIX path)
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;   /* number of index sets living on a parallel (size > 1) communicator */

  PetscFunctionBegin;
  if (!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate that each row/column IS pair shares a communicator, and count the parallel pairs. */
  for (i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  } else { /* if(ismax_c) */
    Mat *A,*B;
    IS  *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
      Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
      array of sequential matrices underlying the resulting parallel matrices.
      Which arrays to allocate is based on the value of MatReuse scall.
      There are as many diag matrices as there are original index sets.
      There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

      Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
      we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
      getsubmats_seq will deposit the extracted diag and off-diag parts.
      However, if reuse is taking place, we have to allocate the seq matrix arrays here.
      If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.

      NOTE(review): the reuse branch sizes A with ismax entries and B with ismax_c
      entries, matching the getsubmats_seq calls below — confirm getsubmats_seq's
      own allocation (non-reuse path) uses the same counts.
    */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    } else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        } else {
          A[i] = (*submat)[i];
        }
      }
    }
    /*
      Construct the complements of the iscol ISs for parallel ISs only.
      These are used to extract the off-diag portion of the resulting parallel matrix.
      The row IS for the off-diag portion is the same as for the diag portion,
      so we merely alias the row IS, while skipping those that are sequential.
    */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
        isrow_c[ii] = isrow[i];
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
      If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
      have been extracted directly into the parallel matrices containing them, or
      simply into the sequential matrix identical with the corresponding A (if size == 1).
      Otherwise, make sure that parallel matrices are constructed from A & B, or the
      A is put into the correct submat slot (if size == 1).
    */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
            For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
          */
          /* Construct submat[i] from the Seq pieces A and B. */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);

          ++ii;
        } else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Free only the holder arrays; the seq matrices themselves are now owned by (or aliased into) *submat. */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */



/*
   AIJ front end: runs the generic parallel-submatrix driver with the
   MPIAIJ-specific extraction/assembly callbacks defined above.
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,MatCreateMPIAIJFromSeqMatrices_Private,MatMPIAIJExtractSeqMatrices_Private);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}