/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Enlarges each of the imax index sets is[] by
   ov layers of matrix connectivity; each layer is one call to
   MatIncreaseOverlap_MPIAIJ_Once(), which replaces the index sets in place.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  data(is[1])
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Adds one layer of overlap to the imax
   index sets is[].  Rows owned locally are expanded by
   MatIncreaseOverlap_MPIAIJ_Local(); rows owned by other processes are
   shipped to their owners (tag1), expanded there via
   MatIncreaseOverlap_MPIAIJ_Receive(), and the enlarged lists are sent back
   (tag2) and merged.  The incoming IS objects are destroyed and replaced by
   newly created (enlarged) sequential index sets.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;
  char           *t_p;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total indices destined for proc p, w3[p] = number of index sets
     contributing to the message for p, w2[p] = 1 iff a message goes to p. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* per-IS counts */
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j      = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages; all messages share the single
     buffer tmp, with outdat[p] pointing at proc p's slice */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  {
    PetscInt *iptr = tmp,ict = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     += ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers; ptr[j] tracks the data-write position past the header */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one bit table (t_p slice) and one
     index list (d_p slice) of length M per index set */
  {
    ierr = PetscMalloc5(imax,PetscBT,&table, imax,PetscInt*,&data, imax,PetscInt,&isz,
                        M*imax,PetscInt,&d_p, (M/PETSC_BITS_PER_BYTE+1)*imax,char,&t_p);CHKERRQ(ierr);
    ierr = PetscMemzero(table,imax*sizeof(PetscBT));CHKERRQ(ierr);
    ierr = PetscMemzero(data,imax*sizeof(PetscInt*));CHKERRQ(ierr);
    ierr = PetscMemzero(isz,imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(d_p,M*imax*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMemzero(t_p,(M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char));CHKERRQ(ierr);

    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table (BT dedups repeated rows) */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the enlarged sets are recreated at the end, so destroy the inputs now */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps with communication) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Expand the requested rows on behalf of the senders */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge into the local tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message: same header layout as the request message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* recreate the (now enlarged) index sets */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
       the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.
344 345 Output: 346 isz - array containing the count of the solution elements corresponding 347 to each index set; 348 data - pointer to the solutions 349 */ 350 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data) 351 { 352 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 353 Mat A = c->A,B = c->B; 354 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 355 PetscInt start,end,val,max,rstart,cstart,*ai,*aj; 356 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i; 357 PetscBT table_i; 358 359 PetscFunctionBegin; 360 rstart = C->rmap->rstart; 361 cstart = C->cmap->rstart; 362 ai = a->i; 363 aj = a->j; 364 bi = b->i; 365 bj = b->j; 366 garray = c->garray; 367 368 369 for (i=0; i<imax; i++) { 370 data_i = data[i]; 371 table_i = table[i]; 372 isz_i = isz[i]; 373 for (j=0,max=isz[i]; j<max; j++) { 374 row = data_i[j] - rstart; 375 start = ai[row]; 376 end = ai[row+1]; 377 for (k=start; k<end; k++) { /* Amat */ 378 val = aj[k] + cstart; 379 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 380 } 381 start = bi[row]; 382 end = bi[row+1]; 383 for (k=start; k<end; k++) { /* Bmat */ 384 val = garray[bj[k]]; 385 if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;} 386 } 387 } 388 isz[i] = isz_i; 389 } 390 PetscFunctionReturn(0); 391 } 392 393 #undef __FUNCT__ 394 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive" 395 /* 396 MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages, 397 and return the output 398 399 Input: 400 C - the matrix 401 nrqr - no of messages being processed. 
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message

  For better efficiency perhaps we should malloc separately each xdata[i],
then if a remalloc is required we need only copy the data for that one row
rather than all previous rows as it is now where a single large chunk of
memory is used.

*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A,B = c->B;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscErrorCode ierr;
  PetscInt       rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  PetscInt       row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  PetscInt       val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
  PetscInt       *rbuf_i,kmax,rbuf_0;
  PetscBT        xtable;

  PetscFunctionBegin;
  m      = C->rmap->N;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  /* ct = total number of index sets requested, total_sz = total number of
     requested rows, summed over all incoming messages */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  = rbuf[i];
    rbuf_0  = rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* heuristic initial size for the (single, shared) reply buffer:
     requested rows times average nonzeros per local row */
  if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
  else max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  /* all replies xdata[0..nrqr-1] live consecutively inside xdata[0];
     on overflow the whole buffer is regrown and the slice pointers fixed up */
  ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr);
  ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr);

  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;   /* read cursor: first data entry after header */
    ct2    =  ct1;          /* write cursor into xdata[i] (same header layout) */
    ct3   += ct1;           /* global count of words written, vs mem_estimate */
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;           /* start of this IS's data in the reply */
      kmax = rbuf_i[2*j];
      /* first copy the requested rows themselves (deduplicated) */
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* grow the shared buffer by 1.5x and re-anchor all slice pointers */
            new_estimate = (PetscInt)(1.5*mem_estimate)+1;
            ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
            ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
            ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* then expand each copied row by its local connectivity */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal (A) part */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal (B) part, via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              new_estimate = (PetscInt)(1.5*mem_estimate)+1;
              ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr);
              ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2;
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  ierr = PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,PetscBool*,Mat*);
extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
/*
    Every processor gets the entire matrix
*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   MatGetSubMatrix_MPIAIJ_All - Gathers the whole parallel matrix A into a
   sequential matrix B on every process (structure via MPI_Allgatherv of the
   column indices, values optionally via a second Allgatherv when
   flag == MAT_GET_VALUES).
*/
PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[])
{
  Mat            B;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ     *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,*recvcounts = 0,*displs = 0;
  PetscInt       sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
  PetscInt       m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  MatScalar      *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr);
  ierr =
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
       Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr);
    /* local rows: diagonal-block plus off-diagonal-block nonzeros */
    for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
      lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
    }
    sendcount = A->rmap->rend - A->rmap->rstart;
    ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    for (i=0; i<size; i++) {
      recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
      displs[i]     = A->rmap->range[i];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /* ---------------------------------------------------------------
       Create the sequential matrix of the same type as the local block diagonal
    */
    ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over, packed directly into B's
       j array at the offset of this process's first row
    */
    sendcount  = ad->nz + bd->nz;
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = A->rmap->rend - A->rmap->rstart;
    cnt        = 0;
    /* merge the B-part (split around the diagonal) and the A-part so the
       columns of each row come out in ascending global order */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
    /*--------------------------------------------------------------------
       Assemble the matrix into usable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr);
    /* set the b->i indices */
    b->i[0] = 0;
    for (i=1; i<=A->rmap->N; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure was built on a previous call */
    B = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  if (flag == MAT_GET_VALUES){
    sendcount = ad->nz + bd->nz;
    sendbuf   = b->a + b->i[rstarts[rank]];
    a_sendbuf = ad->a;
    b_sendbuf = bd->a;
    b_sendj   = bd->j;
    n         = A->rmap->rend - A->rmap->rstart;
    cnt       = 0;
    /* same merge order as the column-index pass above, so values line up */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_sendj] > A->rmap->rstart + i) break;
        sendbuf[cnt++] = *b_sendbuf++;
        m--;
        b_sendj++;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        sendbuf[cnt++] = *a_sendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        sendbuf[cnt++] = *b_sendbuf++;
        b_sendj++;
      }
    }
    if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);

    /* -----------------------------------------------------------------
       Gather all numerical values to all processors
    */
    if (!recvcounts) {
      /* MAT_REUSE_MATRIX path did not allocate these above */
      ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr);
    }
    for (i=0; i<size; i++) {
      recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
    }
    displs[0] = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    recvbuf = b->a;
#if defined(PETSC_HAVE_MPI_IN_PLACE)
    ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#else
    ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr);
#endif
  } /* endof (flag == MAT_GET_VALUES) */
  ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr);

  if
 (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}



#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
/*
   MatGetSubMatrices_MPIAIJ - Extracts ismax sequential submatrices, one per
   (isrow[i],iscol[i]) pair.  Dispatches to MatGetSubMatrix_MPIAIJ_All() when
   every process asks for the identity on the whole (square) matrix;
   otherwise works in stages of at most nmax submatrices via
   MatGetSubMatrices_MPIAIJ_Local().
*/
PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscInt       nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol;
  PetscBool      rowflag,colflag,wantallmatrix=PETSC_FALSE,twantallmatrix,*allcolumns;

  PetscFunctionBegin;
  /*
       Check for special case: each processor gets entire matrix
  */
  if (ismax == 1 && C->rmap->N == C->cmap->N) {
    ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
    ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
    ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
    if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
      wantallmatrix = PETSC_TRUE;
      ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
    }
  }
  /* all processes must agree on taking the fast path.
     NOTE(review): PetscBool is an enum reduced here as MPI_INT — relies on the
     enum being int-sized; confirm this matches the PETSc build configuration */
  ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr);
  if (twantallmatrix) {
    ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
  }

  /* Check for special case: each processor gets entire matrix columns */
  ierr = PetscMalloc((ismax+1)*sizeof(PetscBool),&allcolumns);CHKERRQ(ierr);
  for (i=0; i<ismax; i++) {
    ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
    ierr = ISGetLocalSize(iscol[i],&ncol);CHKERRQ(ierr);
    if (colflag && ncol == C->cmap->N){
      allcolumns[i] = PETSC_TRUE;
    } else {
      allcolumns[i] = PETSC_FALSE;
    }
  }

  /* Determine the number of stages through which submatrices are done
     NOTE(review): assumes C->cmap->N > 0; a 0-column matrix would divide by
     zero here — confirm callers guard against that */
  nmax          = 20*1000000 / (C->cmap->N * sizeof(PetscInt));

  /*
     Each stage will extract nmax submatrices.
     nmax is determined by the matrix column dimension.
     If the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
  */
  if (!nmax) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr);

  for (i=0,pos=0; i<nstages; i++) {
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;  /* extra stage only to stay in sync */
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,allcolumns+pos,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }

  ierr = PetscFree(allcolumns);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* -------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool *allcolumns,Mat *submats)
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  Mat            A = c->A;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
  const PetscInt **icol,**irow;
  PetscInt       *nrow,*ncol,start;
  PetscErrorCode ierr;
  PetscMPIInt    rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr;
  PetscInt       **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc;
  PetscInt       nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol;
  PetscInt
**rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 808 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 809 #if defined (PETSC_USE_CTABLE) 810 PetscTable *cmap,cmap_i=PETSC_NULL,*rmap,rmap_i; 811 #else 812 PetscInt **cmap,*cmap_i=PETSC_NULL,**rmap,*rmap_i; 813 #endif 814 const PetscInt *irow_i; 815 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 816 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 817 MPI_Request *r_waits4,*s_waits3,*s_waits4; 818 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 819 MPI_Status *r_status3,*r_status4,*s_status4; 820 MPI_Comm comm; 821 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 822 PetscMPIInt *onodes1,*olengths1; 823 PetscMPIInt idex,idex2,end; 824 825 PetscFunctionBegin; 826 comm = ((PetscObject)C)->comm; 827 tag0 = ((PetscObject)C)->tag; 828 size = c->size; 829 rank = c->rank; 830 831 /* Get some new tags to keep the communication clean */ 832 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 833 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 834 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 835 836 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 837 838 for (i=0; i<ismax; i++) { 839 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 840 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 841 if (allcolumns[i]){ 842 icol[i] = PETSC_NULL; 843 ncol[i] = C->cmap->N; 844 } else { 845 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 846 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 847 } 848 } 849 850 /* evaluate communication - mesg to who, length of mesg, and buffer space 851 required. 
Based on this, buffers are allocated, and data copied into them*/ 852 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 853 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 854 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 855 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 856 for (i=0; i<ismax; i++) { 857 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 858 jmax = nrow[i]; 859 irow_i = irow[i]; 860 for (j=0; j<jmax; j++) { 861 l = 0; 862 row = irow_i[j]; 863 while (row >= C->rmap->range[l+1]) l++; 864 proc = l; 865 w4[proc]++; 866 } 867 for (j=0; j<size; j++) { 868 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 869 } 870 } 871 872 nrqs = 0; /* no of outgoing messages */ 873 msz = 0; /* total mesg length (for all procs) */ 874 w1[rank] = 0; /* no mesg sent to self */ 875 w3[rank] = 0; 876 for (i=0; i<size; i++) { 877 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 878 } 879 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 880 for (i=0,j=0; i<size; i++) { 881 if (w1[i]) { pa[j] = i; j++; } 882 } 883 884 /* Each message would have a header = 1 + 2*(no of IS) + data */ 885 for (i=0; i<nrqs; i++) { 886 j = pa[i]; 887 w1[j] += w2[j] + 2* w3[j]; 888 msz += w1[j]; 889 } 890 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 891 892 /* Determine the number of messages to expect, their lengths, from from-ids */ 893 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 894 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 895 896 /* Now post the Irecvs corresponding to these messages */ 897 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 898 899 ierr = PetscFree(onodes1);CHKERRQ(ierr); 900 ierr = PetscFree(olengths1);CHKERRQ(ierr); 901 902 /* Allocate Memory for outgoing messages */ 903 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 904 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 905 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 906 907 { 908 PetscInt *iptr = tmp,ict = 0; 909 for (i=0; i<nrqs; i++) { 910 j = pa[i]; 911 iptr += ict; 912 sbuf1[j] = iptr; 913 ict = w1[j]; 914 } 915 } 916 917 /* Form the outgoing messages */ 918 /* Initialize the header space */ 919 for (i=0; i<nrqs; i++) { 920 j = pa[i]; 921 sbuf1[j][0] = 0; 922 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 923 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 924 } 925 926 /* Parse the isrow and copy data into outbuf */ 927 for (i=0; i<ismax; i++) { 928 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 929 irow_i = irow[i]; 930 jmax = nrow[i]; 931 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 932 l = 0; 933 row = irow_i[j]; 934 while (row >= C->rmap->range[l+1]) l++; 935 proc = l; 936 if (proc != rank) { /* copy to the outgoing buf*/ 937 ctr[proc]++; 938 *ptr[proc] = row; 939 ptr[proc]++; 940 } 941 } 942 /* Update the headers for the current IS */ 943 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 944 if ((ctr_j = ctr[j])) { 945 sbuf1_j = sbuf1[j]; 946 k = ++sbuf1_j[0]; 947 sbuf1_j[2*k] = ctr_j; 948 sbuf1_j[2*k-1] = i; 949 } 950 } 951 } 952 953 /* Now post the sends */ 954 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 955 for (i=0; i<nrqs; ++i) { 956 j = pa[i]; 957 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 958 } 959 960 /* Post Receives to capture the buffer size */ 961 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 962 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 963 rbuf2[0] = tmp + msz; 964 for (i=1; i<nrqs; ++i) { 965 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 966 } 967 for (i=0; i<nrqs; ++i) { 968 j = pa[i]; 969 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 970 } 971 972 /* Send to other procs the buf size they should allocate */ 973 974 975 /* Receive messages*/ 976 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 977 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 978 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 979 { 980 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 981 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 982 PetscInt *sbuf2_i; 983 984 for (i=0; i<nrqr; ++i) { 985 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 986 req_size[idex] = 0; 987 rbuf1_i = rbuf1[idex]; 988 start = 2*rbuf1_i[0] + 1; 989 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 990 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 991 sbuf2_i = sbuf2[idex]; 992 for (j=start; j<end; j++) { 993 id = rbuf1_i[j] - rstart; 994 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 995 sbuf2_i[j] = ncols; 996 req_size[idex] += ncols; 997 } 998 req_source[idex] = r_status1[i].MPI_SOURCE; 999 /* form the header */ 1000 sbuf2_i[0] = req_size[idex]; 1001 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 1002 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 1003 } 1004 } 1005 ierr = PetscFree(r_status1);CHKERRQ(ierr); 1006 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 1007 1008 /* recv buffer sizes */ 1009 /* Receive messages*/ 1010 1011 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1012 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1013 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1014 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1015 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1016 1017 for (i=0; i<nrqs; ++i) { 1018 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1019 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1020 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1021 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1022 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1023 } 1024 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1025 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1026 1027 /* Wait on sends1 and sends2 */ 1028 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1029 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1030 1031 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1032 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1033 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1034 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1035 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1036 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1037 1038 /* Now allocate buffers for a->j, and send them off */ 1039 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1040 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1041 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1042 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1043 1044 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1045 { 1046 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1047 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1048 PetscInt cend = C->cmap->rend; 1049 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1050 1051 for (i=0; i<nrqr; i++) { 1052 rbuf1_i = rbuf1[i]; 1053 sbuf_aj_i = sbuf_aj[i]; 1054 ct1 = 2*rbuf1_i[0] + 1; 1055 ct2 = 0; 1056 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1057 kmax = rbuf1[i][2*j]; 1058 for (k=0; k<kmax; k++,ct1++) { 1059 row = rbuf1_i[ct1] - rstart; 1060 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1061 ncols = nzA + nzB; 1062 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1063 1064 /* load the column indices for this row into cols*/ 1065 cols = sbuf_aj_i + ct2; 1066 1067 lwrite = 0; 1068 for (l=0; l<nzB; l++) { 1069 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1070 } 1071 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1072 for (l=0; l<nzB; l++) { 1073 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1074 } 1075 1076 ct2 += ncols; 1077 } 1078 } 1079 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1080 } 1081 } 1082 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1083 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1084 1085 /* Allocate buffers for a->a, and send them off */ 1086 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1087 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1088 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1089 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1090 1091 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1092 { 1093 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1094 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1095 PetscInt cend = C->cmap->rend; 1096 PetscInt *b_j = b->j; 1097 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1098 1099 for (i=0; 
i<nrqr; i++) { 1100 rbuf1_i = rbuf1[i]; 1101 sbuf_aa_i = sbuf_aa[i]; 1102 ct1 = 2*rbuf1_i[0]+1; 1103 ct2 = 0; 1104 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1105 kmax = rbuf1_i[2*j]; 1106 for (k=0; k<kmax; k++,ct1++) { 1107 row = rbuf1_i[ct1] - rstart; 1108 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1109 ncols = nzA + nzB; 1110 cworkB = b_j + b_i[row]; 1111 vworkA = a_a + a_i[row]; 1112 vworkB = b_a + b_i[row]; 1113 1114 /* load the column values for this row into vals*/ 1115 vals = sbuf_aa_i+ct2; 1116 1117 lwrite = 0; 1118 for (l=0; l<nzB; l++) { 1119 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1120 } 1121 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1122 for (l=0; l<nzB; l++) { 1123 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1124 } 1125 1126 ct2 += ncols; 1127 } 1128 } 1129 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1130 } 1131 } 1132 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1133 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1134 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1135 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1136 1137 /* Form the matrix */ 1138 /* create col map: global col of C -> local col of submatrices */ 1139 { 1140 const PetscInt *icol_i; 1141 #if defined (PETSC_USE_CTABLE) 1142 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1143 for (i=0; i<ismax; i++) { 1144 if (!allcolumns[i]){ 1145 ierr = PetscTableCreate(ncol[i]+1,C->cmap->N+1,&cmap[i]);CHKERRQ(ierr); 1146 jmax = ncol[i]; 1147 icol_i = icol[i]; 1148 cmap_i = cmap[i]; 1149 for (j=0; j<jmax; j++) { 1150 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1151 } 1152 } else { 1153 cmap[i] = PETSC_NULL; 1154 } 1155 } 1156 #else 1157 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1158 for (i=0; i<ismax; i++) { 1159 if (!allcolumns[i]){ 1160 ierr = 
PetscMalloc(C->cmap->N*sizeof(PetscInt),&cmap[i]);CHKERRQ(ierr); 1161 ierr = PetscMemzero(cmap[i],C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1162 jmax = ncol[i]; 1163 icol_i = icol[i]; 1164 cmap_i = cmap[i]; 1165 for (j=0; j<jmax; j++) { 1166 cmap_i[icol_i[j]] = j+1; 1167 } 1168 } else { 1169 cmap[i] = PETSC_NULL; 1170 } 1171 } 1172 #endif 1173 } 1174 1175 /* Create lens which is required for MatCreate... */ 1176 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1177 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1178 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1179 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1180 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1181 1182 /* Update lens from local data */ 1183 for (i=0; i<ismax; i++) { 1184 jmax = nrow[i]; 1185 if (!allcolumns[i]) cmap_i = cmap[i]; 1186 irow_i = irow[i]; 1187 lens_i = lens[i]; 1188 for (j=0; j<jmax; j++) { 1189 l = 0; 1190 row = irow_i[j]; 1191 while (row >= C->rmap->range[l+1]) l++; 1192 proc = l; 1193 if (proc == rank) { 1194 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1195 if (!allcolumns[i]){ 1196 for (k=0; k<ncols; k++) { 1197 #if defined (PETSC_USE_CTABLE) 1198 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1199 #else 1200 tcol = cmap_i[cols[k]]; 1201 #endif 1202 if (tcol) { lens_i[j]++;} 1203 } 1204 } else { /* allcolumns */ 1205 lens_i[j] = ncols; 1206 } 1207 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1208 } 1209 } 1210 } 1211 1212 /* Create row map: global row of C -> local row of submatrices */ 1213 #if defined (PETSC_USE_CTABLE) 1214 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1215 for (i=0; i<ismax; i++) { 1216 ierr = PetscTableCreate(nrow[i]+1,C->rmap->N+1,&rmap[i]);CHKERRQ(ierr); 1217 rmap_i = rmap[i]; 1218 irow_i = irow[i]; 1219 jmax = nrow[i]; 1220 for (j=0; j<jmax; j++) { 1221 ierr = 
PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr); 1222 } 1223 } 1224 #else 1225 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1226 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1227 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1228 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1229 for (i=0; i<ismax; i++) { 1230 rmap_i = rmap[i]; 1231 irow_i = irow[i]; 1232 jmax = nrow[i]; 1233 for (j=0; j<jmax; j++) { 1234 rmap_i[irow_i[j]] = j; 1235 } 1236 } 1237 #endif 1238 1239 /* Update lens from offproc data */ 1240 { 1241 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1242 1243 for (tmp2=0; tmp2<nrqs; tmp2++) { 1244 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1245 idex = pa[idex2]; 1246 sbuf1_i = sbuf1[idex]; 1247 jmax = sbuf1_i[0]; 1248 ct1 = 2*jmax+1; 1249 ct2 = 0; 1250 rbuf2_i = rbuf2[idex2]; 1251 rbuf3_i = rbuf3[idex2]; 1252 for (j=1; j<=jmax; j++) { 1253 is_no = sbuf1_i[2*j-1]; 1254 max1 = sbuf1_i[2*j]; 1255 lens_i = lens[is_no]; 1256 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1257 rmap_i = rmap[is_no]; 1258 for (k=0; k<max1; k++,ct1++) { 1259 #if defined (PETSC_USE_CTABLE) 1260 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1261 row--; 1262 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1263 #else 1264 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1265 #endif 1266 max2 = rbuf2_i[ct1]; 1267 for (l=0; l<max2; l++,ct2++) { 1268 if (!allcolumns[is_no]){ 1269 #if defined (PETSC_USE_CTABLE) 1270 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1271 #else 1272 tcol = cmap_i[rbuf3_i[ct2]]; 1273 #endif 1274 if (tcol) { 1275 lens_i[row]++; 1276 } 1277 } else { /* allcolumns */ 1278 lens_i[row]++; /* lens_i[row] += max2 ? 
*/ 1279 } 1280 } 1281 } 1282 } 1283 } 1284 } 1285 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1286 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1287 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1288 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1289 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1290 1291 /* Create the submatrices */ 1292 if (scall == MAT_REUSE_MATRIX) { 1293 PetscBool flag; 1294 1295 /* 1296 Assumes new rows are same length as the old rows,hence bug! 1297 */ 1298 for (i=0; i<ismax; i++) { 1299 mat = (Mat_SeqAIJ *)(submats[i]->data); 1300 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1301 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1302 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1303 /* Initial matrix as if empty */ 1304 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1305 submats[i]->factortype = C->factortype; 1306 } 1307 } else { 1308 for (i=0; i<ismax; i++) { 1309 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1310 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1311 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1312 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1313 } 1314 } 1315 1316 /* Assemble the matrices */ 1317 /* First assemble the local rows */ 1318 { 1319 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1320 PetscScalar *imat_a; 1321 1322 for (i=0; i<ismax; i++) { 1323 mat = (Mat_SeqAIJ*)submats[i]->data; 1324 imat_ilen = mat->ilen; 1325 imat_j = mat->j; 1326 imat_i = mat->i; 1327 imat_a = mat->a; 1328 if (!allcolumns[i]) cmap_i = cmap[i]; 1329 rmap_i = rmap[i]; 1330 irow_i = irow[i]; 1331 jmax = nrow[i]; 1332 for (j=0; j<jmax; j++) { 1333 l = 0; 1334 row = 
irow_i[j]; 1335 while (row >= C->rmap->range[l+1]) l++; 1336 proc = l; 1337 if (proc == rank) { 1338 old_row = row; 1339 #if defined (PETSC_USE_CTABLE) 1340 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1341 row--; 1342 #else 1343 row = rmap_i[row]; 1344 #endif 1345 ilen_row = imat_ilen[row]; 1346 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1347 mat_i = imat_i[row] ; 1348 mat_a = imat_a + mat_i; 1349 mat_j = imat_j + mat_i; 1350 if (!allcolumns[i]){ 1351 for (k=0; k<ncols; k++) { 1352 #if defined (PETSC_USE_CTABLE) 1353 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1354 #else 1355 tcol = cmap_i[cols[k]]; 1356 #endif 1357 if (tcol){ 1358 *mat_j++ = tcol - 1; 1359 *mat_a++ = vals[k]; 1360 ilen_row++; 1361 } 1362 } 1363 } else { /* allcolumns */ 1364 for (k=0; k<ncols; k++) { 1365 *mat_j++ = cols[k] ; /* global col index! */ 1366 *mat_a++ = vals[k]; 1367 ilen_row++; 1368 } 1369 } 1370 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1371 imat_ilen[row] = ilen_row; 1372 } 1373 } 1374 } 1375 } 1376 1377 /* Now assemble the off proc rows*/ 1378 { 1379 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1380 PetscInt *imat_j,*imat_i; 1381 PetscScalar *imat_a,*rbuf4_i; 1382 1383 for (tmp2=0; tmp2<nrqs; tmp2++) { 1384 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1385 idex = pa[idex2]; 1386 sbuf1_i = sbuf1[idex]; 1387 jmax = sbuf1_i[0]; 1388 ct1 = 2*jmax + 1; 1389 ct2 = 0; 1390 rbuf2_i = rbuf2[idex2]; 1391 rbuf3_i = rbuf3[idex2]; 1392 rbuf4_i = rbuf4[idex2]; 1393 for (j=1; j<=jmax; j++) { 1394 is_no = sbuf1_i[2*j-1]; 1395 rmap_i = rmap[is_no]; 1396 if (!allcolumns[is_no]){cmap_i = cmap[is_no];} 1397 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1398 imat_ilen = mat->ilen; 1399 imat_j = mat->j; 1400 imat_i = mat->i; 1401 imat_a = mat->a; 1402 max1 = sbuf1_i[2*j]; 1403 for (k=0; k<max1; k++,ct1++) { 1404 row = sbuf1_i[ct1]; 1405 #if defined (PETSC_USE_CTABLE) 1406 ierr = 
PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1407 row--; 1408 #else 1409 row = rmap_i[row]; 1410 #endif 1411 ilen = imat_ilen[row]; 1412 mat_i = imat_i[row] ; 1413 mat_a = imat_a + mat_i; 1414 mat_j = imat_j + mat_i; 1415 max2 = rbuf2_i[ct1]; 1416 if (!allcolumns[is_no]){ 1417 for (l=0; l<max2; l++,ct2++) { 1418 1419 #if defined (PETSC_USE_CTABLE) 1420 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1421 #else 1422 tcol = cmap_i[rbuf3_i[ct2]]; 1423 #endif 1424 if (tcol) { 1425 *mat_j++ = tcol - 1; 1426 *mat_a++ = rbuf4_i[ct2]; 1427 ilen++; 1428 } 1429 } 1430 } else { /* allcolumns */ 1431 for (l=0; l<max2; l++,ct2++) { 1432 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */ 1433 *mat_a++ = rbuf4_i[ct2]; 1434 ilen++; 1435 } 1436 } 1437 imat_ilen[row] = ilen; 1438 } 1439 } 1440 } 1441 } 1442 1443 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1444 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1445 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1446 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1447 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1448 1449 /* Restore the indices */ 1450 for (i=0; i<ismax; i++) { 1451 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1452 if (!allcolumns[i]){ 1453 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1454 } 1455 } 1456 1457 /* Destroy allocated memory */ 1458 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1459 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1460 ierr = PetscFree(pa);CHKERRQ(ierr); 1461 1462 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1463 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1464 for (i=0; i<nrqr; ++i) { 1465 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1466 } 1467 for (i=0; i<nrqs; ++i) { 1468 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1469 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1470 } 1471 1472 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1473 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1474 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 
  /* ------------------------------------------------------------------
     Tail of MatGetSubMatrices_MPIAIJ (the function begins earlier in this
     file): free the communication buffers and the row/column maps, then
     assemble the extracted sequential submatrices.
     ------------------------------------------------------------------ */
  ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); /* column indices were sent from one contiguous slab; sbuf_aj[i>0] point into it */
  ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
  ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); /* matrix values use the same single-slab layout */
  ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);

#if defined (PETSC_USE_CTABLE)
  for (i=0; i<ismax; i++) {ierr = PetscTableDestroy((PetscTable*)&rmap[i]);CHKERRQ(ierr);}
#else
  ierr = PetscFree(rmap[0]);CHKERRQ(ierr); /* rmap[0] holds the storage of all ismax row maps */
#endif
  ierr = PetscFree(rmap);CHKERRQ(ierr);

  for (i=0; i<ismax; i++) {
    if (!allcolumns[i]){ /* no column map was built for an "all columns" index set */
#if defined (PETSC_USE_CTABLE)
      ierr = PetscTableDestroy((PetscTable*)&cmap[i]);CHKERRQ(ierr);
#else
      ierr = PetscFree(cmap[i]);CHKERRQ(ierr);
#endif
    }
  }
  ierr = PetscFree(cmap);CHKERRQ(ierr);
  ierr = PetscFree(lens[0]);CHKERRQ(ierr); /* lens[0] holds the storage of all ismax row-length arrays */
  ierr = PetscFree(lens);CHKERRQ(ierr);

  /* Entries were written directly into each submatrix's i/j/a arrays above;
     a final assembly brings every sequential submatrix into a usable state. */
  for (i=0; i<ismax; i++) {
    ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    MatCreateMPIAIJFromSeqMatrices_Private - builds a parallel MPIAIJ matrix C on comm
    directly from its local diagonal block A and off-diagonal block B (both SEQAIJ).

    Observe that the Seq matrices used to construct this MPI matrix are not increfed.
    Be careful not to destroy them elsewhere.
 */
#undef __FUNCT__
#define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private"
PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C)
{
  /* If making this function public, change the error returned in this function away from _PLIB. */
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij;
  PetscBool      seqaij;

  PetscFunctionBegin;
  /* Check to make sure the component matrices are compatible with C. */
  ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type");
  }
  ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr);
  if(!seqaij) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type");
  }
  /* A and B must have the same local row count and matching block sizes. */
  if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatCreate(comm, C); CHKERRQ(ierr);
  /* Local sizes come from the diagonal block; global sizes are derived by the layout setup. */
  ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
  /* The off-diagonal block must account for exactly the columns A does not cover. */
  if((*C)->cmap->N != A->cmap->n + B->cmap->n) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix");
  }
  ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr);
  aij = (Mat_MPIAIJ*)((*C)->data);
  /* Plug the sequential blocks straight in; ownership stays with the caller (no incref). */
  aij->A = A;
  aij->B = B;
  ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr);
  ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr);
  /* C is usable only to the extent its pieces are. */
  (*C)->preallocated = (PetscBool)(A->preallocated && B->preallocated);
  (*C)->assembled    = (PetscBool)(A->assembled && B->assembled);
  PetscFunctionReturn(0);
}

/*
    MatMPIAIJExtractSeqMatrices_Private - returns the diagonal (A) and off-diagonal (B)
    sequential blocks of the MPIAIJ matrix C.

    Note that we don't incref *A and *B, so be careful!
 */
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private"
PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B)
{
  Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data);
  PetscFunctionBegin;
  PetscValidPointer(A,2);
  PetscValidPointer(B,3);
  *A = aij->A;
  *B = aij->B;
  PetscFunctionReturn(0);
}


/*
    MatGetSubMatricesParallel_MPIXAIJ - extracts submatrices of C indexed by (isrow[i], iscol[i]);
    unlike the plain MatGetSubMatrices path, index sets living on a parallel communicator yield
    PARALLEL submatrices on that communicator. The type-specific operations are injected as
    callbacks so the same driver serves different AIJ flavors:
      getsubmats_seq - extracts sequential submatrices (diag or off-diag pieces),
      makefromseq    - glues a diag/off-diag pair into a parallel matrix,
      extractseq     - recovers the diag/off-diag pair from an existing parallel matrix (reuse path).
 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
                                                 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**),
                                                 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*),
                                                 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*))
{
  PetscErrorCode ierr;
  PetscMPIInt    size, flag;
  PetscInt       i,ii;
  PetscInt       ismax_c;  /* number of index sets with a parallel (size > 1) communicator */
  PetscFunctionBegin;
  if(!ismax) {
    PetscFunctionReturn(0);
  }
  /* Validate the IS pairs and count how many are parallel. */
  for(i = 0, ismax_c = 0; i < ismax; ++i) {
    ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr);
    if(flag != MPI_IDENT) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
    }
    ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
    if(size > 1) {
      ++ismax_c;
    }
  }
  if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. */
    ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr);
  }
  else { /* if(ismax_c) */
    Mat *A,*B;   /* A: diag pieces (one per IS); B: off-diag pieces (one per PARALLEL IS) */
    IS *isrow_c, *iscol_c;
    PetscMPIInt size;
    /*
     Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
     array of sequential matrices underlying the resulting parallel matrices.
     Which arrays to allocate is based on the value of MatReuse scall.
     There are as many diag matrices as there are original index sets.
     There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets.

     Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists,
     we need to consolidate the underlying seq matrices into a single array to serve as placeholders into which
     getsubmats_seq will deposit the extracted diag and off-diag parts.
     However, if reuse is taking place, we have to allocate the seq matrix arrays here.
     If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq.
     */

    /* Parallel matrix array is allocated only if no reuse is taking place. */
    if (scall != MAT_REUSE_MATRIX) {
      ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr);
    }
    else {
      ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr);
      ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr);
      /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr);
          ++ii;
        }
        else {
          /* Sequential IS: the submatrix itself is the "diag" piece; there is no off-diag piece. */
          A[i] = (*submat)[i];
        }
      }
    }
    /*
     Construct the complements of the iscol ISs for parallel ISs only.
     These are used to extract the off-diag portion of the resulting parallel matrix.
     The row IS for the off-diag portion is the same as for the diag portion,
     so we merely alias the row IS, while skipping those that are sequential.
     */
    ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr);
    for(i = 0, ii = 0; i < ismax; ++i) {
      ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
      if(size > 1) {
        isrow_c[ii] = isrow[i]; /* aliased, not duplicated: do not destroy below */
        ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr);
        ++ii;
      }
    }
    /* Now obtain the sequential A and B submatrices separately.
       NOTE(review): with MAT_INITIAL_MATRIX the arrays A and B are presumably
       allocated inside getsubmats_seq (hence the PetscFree(A)/PetscFree(B) below
       matches both paths) -- confirm against the callback's contract. */
    ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr);
    ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr);
    for(ii = 0; ii < ismax_c; ++ii) {
      ierr = ISDestroy(&iscol_c[ii]); CHKERRQ(ierr);
    }
    ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr);
    /*
     If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B
     have been extracted directly into the parallel matrices containing them, or
     simply into the sequential matrix identical with the corresponding A (if size == 1).
     Otherwise, make sure that parallel matrices are constructed from A & B, or the
     A is put into the correct submat slot (if size == 1).
     */
    if(scall != MAT_REUSE_MATRIX) {
      for(i = 0, ii = 0; i < ismax; ++i) {
        ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr);
        if(size > 1) {
          /*
           For each parallel isrow[i], create parallel matrices from the extracted sequential matrices.
           Construct submat[i] from the Seq pieces A and B.
           */
          ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr);
          ++ii;
        }
        else {
          (*submat)[i] = A[i];
        }
      }
    }
    /* Only the holder arrays are freed; the Mat objects now live inside (or are) the submatrices. */
    ierr = PetscFree(A); CHKERRQ(ierr);
    ierr = PetscFree(B); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}/* MatGetSubMatricesParallel_MPIXAIJ() */


/*
    MatGetSubMatricesParallel_MPIAIJ - the MPIAIJ instantiation of the generic driver above,
    wiring in the AIJ-specific extraction/assembly callbacks.
 */
#undef __FUNCT__
#define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ"
PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
{
  PetscErrorCode ierr;
  PetscFunctionBegin;
  ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ,
                                           MatCreateMPIAIJFromSeqMatrices_Private,
                                           MatMPIAIJExtractSeqMatrices_Private);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}