/*
   Routines to compute overlapping regions of a parallel MPI matrix
   and to find submatrices that were shared across processors.
*/
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>

static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS *);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char **,PetscInt*,PetscInt**);
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt **,PetscInt**,PetscInt*);
extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
/*
   MatIncreaseOverlap_MPIAIJ - Expands each of the imax index sets is[] by
   'ov' levels of overlap, by applying the one-level expansion
   MatIncreaseOverlap_MPIAIJ_Once() 'ov' times.  Collective on C.
*/
PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
{
  PetscErrorCode ierr;
  PetscInt       i;

  PetscFunctionBegin;
  if (ov < 0) SETERRQ(((PetscObject)C)->comm,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
  for (i=0; i<ov; ++i) {
    ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  Sample message format:
  If a processor A wants processor B to process some elements corresponding
  to index sets is[1],is[5]
  mesg [0] = 2   (no of index sets in the mesg)
  -----------
  mesg [1] = 1 => is[1]
  mesg [2] = sizeof(is[1]);
  -----------
  mesg [3] = 5  => is[5]
  mesg [4] = sizeof(is[5]);
  -----------
  mesg [5]
  mesg [n]  datas[1]
  -----------
  mesg[n+1]
  mesg[m]  data(is[5])
  -----------

  Notes:
  nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be or which have been processed)
*/
#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
/*
   MatIncreaseOverlap_MPIAIJ_Once - Performs ONE level of overlap expansion on
   the index sets is[]:  rows already in each set contribute their nonzero
   column indices; locally-owned rows are processed on this rank, rows owned
   elsewhere are shipped to their owners and the results shipped back.
   On return each is[i] has been destroyed and recreated with the expanded set.
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
{
  Mat_MPIAIJ     *c = (Mat_MPIAIJ*)C->data;
  PetscMPIInt    *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
  const PetscInt **idx,*idx_i;
  PetscInt       *n,**data,len;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2;
  PetscInt       M,i,j,k,**rbuf,row,proc = 0,nrqs,msz,**outdat,**ptr;
  PetscInt       *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
  PetscBT        *table;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*r_waits1,*s_waits2,*r_waits2;
  MPI_Status     *s_status,*recv_status;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  M    = C->rmap->N;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  ierr = PetscMalloc2(imax,PetscInt*,&idx,imax,PetscInt,&n);CHKERRQ(ierr);

  for (i=0; i<imax; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them.
     w1[p] = total index count destined for proc p, w3[p] = number of index
     sets that contribute to proc p, w4 = per-IS scratch counts. */
  ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr);
  ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
  for (i=0; i<imax; i++) {
    ierr  = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialise work vector*/
    idx_i = idx[i];
    len   = n[i];
    for (j=0; j<len; j++) {
      row = idx_i[j];
      if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
      ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
      w4[proc]++;
    }
    for (j=0; j<size; j++){
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all proc */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i=0; i<size; i++) {
    if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  /* pa - is list of processors to communicate with */
  ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr);
  for (i=0,j=0; i<size; i++) {
    if (w1[i]) {pa[j] = i; j++;}
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i=0; i<nrqs; i++) {
    j     = pa[i];
    w1[j] += w2[j] + 2*w3[j];
    msz   += w1[j];
  }

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);

  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);

  /* Allocate Memory for outgoing messages */
  ierr = PetscMalloc4(size,PetscInt*,&outdat,size,PetscInt*,&ptr,msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr);
  ierr = PetscMemzero(outdat,size*sizeof(PetscInt*));CHKERRQ(ierr);
  ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr);

  /* Carve the single 'tmp' buffer into one outgoing message per destination */
  {
    PetscInt *iptr = tmp,ict  = 0;
    for (i=0; i<nrqs; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers */
  for (i=0; i<nrqs; i++) {
    j            = pa[i];
    outdat[j][0] = 0;
    ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr);
    ptr[j]       = outdat[j] + 2*w3[j] + 1;
  }

  /* Memory for doing local proc's work: one malloc carved into the table[]
     bit arrays, the data[] index lists, and the isz[] counters */
  {
    PetscInt *d_p;
    char     *t_p;

    /* should replace with PetscMallocN() */
    ierr = PetscMalloc((imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                       M*imax*sizeof(PetscInt) + (M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1,&table);CHKERRQ(ierr);
    ierr = PetscMemzero(table,(imax)*(sizeof(PetscBT) + sizeof(PetscInt*)+ sizeof(PetscInt)) +
                        M*imax*sizeof(PetscInt) + (M/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1);CHKERRQ(ierr);
    data = (PetscInt **)(table + imax);
    isz  = (PetscInt  *)(data  + imax);
    d_p  = (PetscInt  *)(isz   + imax);
    t_p  = (char      *)(d_p   + M*imax);
    for (i=0; i<imax; i++) {
      table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
      data[i]  = d_p + M*i;
    }
  }

  /* Parse the IS and update local tables and the outgoing buf with the data */
  {
    PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
    PetscBT  table_i;

    for (i=0; i<imax; i++) {
      ierr    = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr);
      n_i     = n[i];
      table_i = table[i];
      idx_i   = idx[i];
      data_i  = data[i];
      isz_i   = isz[i];
      for (j=0; j<n_i; j++) {  /* parse the indices of each IS */
        row  = idx_i[j];
        ierr = PetscLayoutFindOwner(C->rmap,row,&proc);CHKERRQ(ierr);
        if (proc != rank) { /* copy to the outgoing buffer */
          ctr[proc]++;
          *ptr[proc] = row;
          ptr[proc]++;
        } else { /* Update the local table */
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
      }
      /* Update the headers for the current IS */
      for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
        if ((ctr_j = ctr[j])) {
          outdat_j        = outdat[j];
          k               = ++outdat_j[0];
          outdat_j[2*k]   = ctr_j;
          outdat_j[2*k-1] = i;
        }
      }
      isz[i] = isz_i;
    }
  }

  /* Now post the sends */
  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
  for (i=0; i<nrqs; ++i) {
    j    = pa[i];
    ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
  }

  /* No longer need the original indices */
  for (i=0; i<imax; ++i) {
    ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
  }
  ierr = PetscFree2(idx,n);CHKERRQ(ierr);

  /* the is[] will be recreated below with the expanded contents */
  for (i=0; i<imax; ++i) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }

  /* Do Local work (overlaps the communication posted above) */
  ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);

  /* Receive messages */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
  if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}

  ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}

  /* Phase 1 sends are complete - deallocate buffers */
  ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
  ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);

  /* Process the requests received from the other ranks */
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&xdata);CHKERRQ(ierr);
  ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt),&isz1);CHKERRQ(ierr);
  ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
  ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf);CHKERRQ(ierr);

  /* Send the data back */
  /* Do a global reduction to know the buffer space req for incoming messages */
  {
    PetscMPIInt *rw1;

    ierr = PetscMalloc(size*sizeof(PetscMPIInt),&rw1);CHKERRQ(ierr);
    ierr = PetscMemzero(rw1,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

    for (i=0; i<nrqr; ++i) {
      proc = recv_status[i].MPI_SOURCE;
      if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
      rw1[proc] = isz1[i];
    }
    ierr = PetscFree(onodes1);CHKERRQ(ierr);
    ierr = PetscFree(olengths1);CHKERRQ(ierr);

    /* Determine the number of messages to expect, their lengths, from from-ids */
    ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
    ierr = PetscFree(rw1);CHKERRQ(ierr);
  }
  /* Now post the Irecvs corresponding to these messages */
  ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
  for (i=0; i<nrqr; ++i) {
    j    = recv_status[i].MPI_SOURCE;
    ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
  }

  /* receive work done on other processors and merge it into the local tables */
  {
    PetscInt    is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
    PetscMPIInt idex;
    PetscBT     table_i;
    MPI_Status  *status2;

    ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
    for (i=0; i<nrqs; ++i) {
      ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
      /* Process the message */
      rbuf2_i = rbuf2[idex];
      ct1     = 2*rbuf2_i[0]+1;
      jmax    = rbuf2[idex][0];
      for (j=1; j<=jmax; j++) {
        max     = rbuf2_i[2*j];
        is_no   = rbuf2_i[2*j-1];
        isz_i   = isz[is_no];
        data_i  = data[is_no];
        table_i = table[is_no];
        for (k=0; k<max; k++,ct1++) {
          row = rbuf2_i[ct1];
          if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
        }
        isz[is_no] = isz_i;
      }
    }

    if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
    ierr = PetscFree(status2);CHKERRQ(ierr);
  }

  /* Recreate the index sets with the expanded contents */
  for (i=0; i<imax; ++i) {
    ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
  }

  ierr = PetscFree(onodes2);CHKERRQ(ierr);
  ierr = PetscFree(olengths2);CHKERRQ(ierr);

  ierr = PetscFree(pa);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
  ierr = PetscFree(rbuf2);CHKERRQ(ierr);
  ierr = PetscFree(s_waits1);CHKERRQ(ierr);
  ierr = PetscFree(r_waits1);CHKERRQ(ierr);
  ierr = PetscFree(s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(r_waits2);CHKERRQ(ierr);
  ierr = PetscFree(table);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);
  ierr = PetscFree(recv_status);CHKERRQ(ierr);
  ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
  ierr = PetscFree(xdata);CHKERRQ(ierr);
  ierr = PetscFree(isz1);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
/*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
   the work on the local processor.

     Inputs:
      C      - MAT_MPIAIJ;
      imax - total no of index sets processed at a time;
      table  - an array of char - size = m bits.

     Output:
      isz    - array containing the count of the solution elements corresponding
               to each index set;
      data   - pointer to the solutions
*/
static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
{
  Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
  Mat        A = c->A,B = c->B;
  Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  PetscInt   start,end,val,max,rstart,cstart,*ai,*aj;
  PetscInt   *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
  PetscBT    table_i;

  PetscFunctionBegin;
  rstart = C->rmap->rstart;
  cstart = C->cmap->rstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;

  for (i=0; i<imax; i++) {
    data_i  = data[i];
    table_i = table[i];
    isz_i   = isz[i];
    for (j=0,max=isz[i]; j<max; j++) {
      /* for each locally-owned row already in the set, add the column
         indices of its nonzeros (= one level of overlap) */
      row   = data_i[j] - rstart;
      start = ai[row];
      end   = ai[row+1];
      for (k=start; k<end; k++) { /* Amat: local columns, shift to global */
        val = aj[k] + cstart;
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
      start = bi[row];
      end   = bi[row+1];
      for (k=start; k<end; k++) { /* Bmat: map compressed column to global via garray */
        val = garray[bj[k]];
        if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
      }
    }
    isz[i] = isz_i;
  }
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
/*
   MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
   and return the output

     Input:
      C    - the matrix
      nrqr - no of messages being processed.
405 rbuf - an array of pointers to the recieved requests 406 407 Output: 408 xdata - array of messages to be sent back 409 isz1 - size of each message 410 411 For better efficiency perhaps we should malloc separately each xdata[i], 412 then if a remalloc is required we need only copy the data for that one row 413 rather then all previous rows as it is now where a single large chunck of 414 memory is used. 415 416 */ 417 static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1) 418 { 419 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 420 Mat A = c->A,B = c->B; 421 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data; 422 PetscErrorCode ierr; 423 PetscMPIInt rank; 424 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k; 425 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end; 426 PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr; 427 PetscInt *rbuf_i,kmax,rbuf_0; 428 PetscBT xtable; 429 430 PetscFunctionBegin; 431 rank = c->rank; 432 m = C->rmap->N; 433 rstart = C->rmap->rstart; 434 cstart = C->cmap->rstart; 435 ai = a->i; 436 aj = a->j; 437 bi = b->i; 438 bj = b->j; 439 garray = c->garray; 440 441 442 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) { 443 rbuf_i = rbuf[i]; 444 rbuf_0 = rbuf_i[0]; 445 ct += rbuf_0; 446 for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; } 447 } 448 449 if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n; 450 else max1 = 1; 451 mem_estimate = 3*((total_sz > max1 ? 
total_sz : max1)+1); 452 ierr = PetscMalloc(mem_estimate*sizeof(PetscInt),&xdata[0]);CHKERRQ(ierr); 453 ++no_malloc; 454 ierr = PetscBTCreate(m,xtable);CHKERRQ(ierr); 455 ierr = PetscMemzero(isz1,nrqr*sizeof(PetscInt));CHKERRQ(ierr); 456 457 ct3 = 0; 458 for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */ 459 rbuf_i = rbuf[i]; 460 rbuf_0 = rbuf_i[0]; 461 ct1 = 2*rbuf_0+1; 462 ct2 = ct1; 463 ct3 += ct1; 464 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/ 465 ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr); 466 oct2 = ct2; 467 kmax = rbuf_i[2*j]; 468 for (k=0; k<kmax; k++,ct1++) { 469 row = rbuf_i[ct1]; 470 if (!PetscBTLookupSet(xtable,row)) { 471 if (!(ct3 < mem_estimate)) { 472 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 473 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 474 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 475 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 476 xdata[0] = tmp; 477 mem_estimate = new_estimate; ++no_malloc; 478 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 479 } 480 xdata[i][ct2++] = row; 481 ct3++; 482 } 483 } 484 for (k=oct2,max2=ct2; k<max2; k++) { 485 row = xdata[i][k] - rstart; 486 start = ai[row]; 487 end = ai[row+1]; 488 for (l=start; l<end; l++) { 489 val = aj[l] + cstart; 490 if (!PetscBTLookupSet(xtable,val)) { 491 if (!(ct3 < mem_estimate)) { 492 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 493 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 494 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 495 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 496 xdata[0] = tmp; 497 mem_estimate = new_estimate; ++no_malloc; 498 for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 499 } 500 xdata[i][ct2++] = val; 501 ct3++; 502 } 503 } 504 start = bi[row]; 505 end = bi[row+1]; 506 for (l=start; l<end; l++) { 507 val = garray[bj[l]]; 508 if (!PetscBTLookupSet(xtable,val)) { 509 if (!(ct3 < 
mem_estimate)) { 510 new_estimate = (PetscInt)(1.5*mem_estimate)+1; 511 ierr = PetscMalloc(new_estimate*sizeof(PetscInt),&tmp);CHKERRQ(ierr); 512 ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(PetscInt));CHKERRQ(ierr); 513 ierr = PetscFree(xdata[0]);CHKERRQ(ierr); 514 xdata[0] = tmp; 515 mem_estimate = new_estimate; ++no_malloc; 516 for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];} 517 } 518 xdata[i][ct2++] = val; 519 ct3++; 520 } 521 } 522 } 523 /* Update the header*/ 524 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/ 525 xdata[i][2*j-1] = rbuf_i[2*j-1]; 526 } 527 xdata[i][0] = rbuf_0; 528 xdata[i+1] = xdata[i] + ct2; 529 isz1[i] = ct2; /* size of each message */ 530 } 531 ierr = PetscBTDestroy(xtable);CHKERRQ(ierr); 532 ierr = PetscInfo4(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",rank,mem_estimate,ct3,no_malloc);CHKERRQ(ierr); 533 PetscFunctionReturn(0); 534 } 535 /* -------------------------------------------------------------------------*/ 536 extern PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*); 537 extern PetscErrorCode MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType); 538 /* 539 Every processor gets the entire matrix 540 */ 541 #undef __FUNCT__ 542 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All" 543 PetscErrorCode MatGetSubMatrix_MPIAIJ_All(Mat A,MatGetSubMatrixOption flag,MatReuse scall,Mat *Bin[]) 544 { 545 Mat B; 546 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 547 Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data; 548 PetscErrorCode ierr; 549 PetscMPIInt size,rank,*recvcounts = 0,*displs = 0; 550 PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j; 551 PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf; 552 MatScalar *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf; 553 554 PetscFunctionBegin; 555 ierr = MPI_Comm_size(((PetscObject)A)->comm,&size);CHKERRQ(ierr); 556 ierr = 
MPI_Comm_rank(((PetscObject)A)->comm,&rank);CHKERRQ(ierr); 557 558 if (scall == MAT_INITIAL_MATRIX) { 559 /* ---------------------------------------------------------------- 560 Tell every processor the number of nonzeros per row 561 */ 562 ierr = PetscMalloc(A->rmap->N*sizeof(PetscInt),&lens);CHKERRQ(ierr); 563 for (i=A->rmap->rstart; i<A->rmap->rend; i++) { 564 lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart]; 565 } 566 sendcount = A->rmap->rend - A->rmap->rstart; 567 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 568 for (i=0; i<size; i++) { 569 recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i]; 570 displs[i] = A->rmap->range[i]; 571 } 572 #if defined(PETSC_HAVE_MPI_IN_PLACE) 573 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 574 #else 575 ierr = MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 576 #endif 577 /* --------------------------------------------------------------- 578 Create the sequential matrix of the same type as the local block diagonal 579 */ 580 ierr = MatCreate(PETSC_COMM_SELF,&B);CHKERRQ(ierr); 581 ierr = MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 582 ierr = MatSetType(B,((PetscObject)a->A)->type_name);CHKERRQ(ierr); 583 ierr = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr); 584 ierr = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr); 585 **Bin = B; 586 b = (Mat_SeqAIJ *)B->data; 587 588 /*-------------------------------------------------------------------- 589 Copy my part of matrix column indices over 590 */ 591 sendcount = ad->nz + bd->nz; 592 jsendbuf = b->j + b->i[rstarts[rank]]; 593 a_jsendbuf = ad->j; 594 b_jsendbuf = bd->j; 595 n = A->rmap->rend - A->rmap->rstart; 596 cnt = 0; 597 for (i=0; i<n; i++) { 598 599 /* put in lower 
diagonal portion */ 600 m = bd->i[i+1] - bd->i[i]; 601 while (m > 0) { 602 /* is it above diagonal (in bd (compressed) numbering) */ 603 if (garray[*b_jsendbuf] > A->rmap->rstart + i) break; 604 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 605 m--; 606 } 607 608 /* put in diagonal portion */ 609 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 610 jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++; 611 } 612 613 /* put in upper diagonal portion */ 614 while (m-- > 0) { 615 jsendbuf[cnt++] = garray[*b_jsendbuf++]; 616 } 617 } 618 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 619 620 /*-------------------------------------------------------------------- 621 Gather all column indices to all processors 622 */ 623 for (i=0; i<size; i++) { 624 recvcounts[i] = 0; 625 for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) { 626 recvcounts[i] += lens[j]; 627 } 628 } 629 displs[0] = 0; 630 for (i=1; i<size; i++) { 631 displs[i] = displs[i-1] + recvcounts[i-1]; 632 } 633 #if defined(PETSC_HAVE_MPI_IN_PLACE) 634 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 635 #else 636 ierr = MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,((PetscObject)A)->comm);CHKERRQ(ierr); 637 #endif 638 /*-------------------------------------------------------------------- 639 Assemble the matrix into useable form (note numerical values not yet set) 640 */ 641 /* set the b->ilen (length of each row) values */ 642 ierr = PetscMemcpy(b->ilen,lens,A->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 643 /* set the b->i indices */ 644 b->i[0] = 0; 645 for (i=1; i<=A->rmap->N; i++) { 646 b->i[i] = b->i[i-1] + lens[i-1]; 647 } 648 ierr = PetscFree(lens);CHKERRQ(ierr); 649 ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 650 ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 651 652 } else { 653 B = **Bin; 654 b = (Mat_SeqAIJ 
*)B->data; 655 } 656 657 /*-------------------------------------------------------------------- 658 Copy my part of matrix numerical values into the values location 659 */ 660 if (flag == MAT_GET_VALUES){ 661 sendcount = ad->nz + bd->nz; 662 sendbuf = b->a + b->i[rstarts[rank]]; 663 a_sendbuf = ad->a; 664 b_sendbuf = bd->a; 665 b_sendj = bd->j; 666 n = A->rmap->rend - A->rmap->rstart; 667 cnt = 0; 668 for (i=0; i<n; i++) { 669 670 /* put in lower diagonal portion */ 671 m = bd->i[i+1] - bd->i[i]; 672 while (m > 0) { 673 /* is it above diagonal (in bd (compressed) numbering) */ 674 if (garray[*b_sendj] > A->rmap->rstart + i) break; 675 sendbuf[cnt++] = *b_sendbuf++; 676 m--; 677 b_sendj++; 678 } 679 680 /* put in diagonal portion */ 681 for (j=ad->i[i]; j<ad->i[i+1]; j++) { 682 sendbuf[cnt++] = *a_sendbuf++; 683 } 684 685 /* put in upper diagonal portion */ 686 while (m-- > 0) { 687 sendbuf[cnt++] = *b_sendbuf++; 688 b_sendj++; 689 } 690 } 691 if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt); 692 693 /* ----------------------------------------------------------------- 694 Gather all numerical values to all processors 695 */ 696 if (!recvcounts) { 697 ierr = PetscMalloc2(size,PetscMPIInt,&recvcounts,size,PetscMPIInt,&displs);CHKERRQ(ierr); 698 } 699 for (i=0; i<size; i++) { 700 recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]]; 701 } 702 displs[0] = 0; 703 for (i=1; i<size; i++) { 704 displs[i] = displs[i-1] + recvcounts[i-1]; 705 } 706 recvbuf = b->a; 707 #if defined(PETSC_HAVE_MPI_IN_PLACE) 708 ierr = MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 709 #else 710 ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,((PetscObject)A)->comm);CHKERRQ(ierr); 711 #endif 712 } /* endof (flag == MAT_GET_VALUES) */ 713 ierr = PetscFree2(recvcounts,displs);CHKERRQ(ierr); 714 715 if 
(A->symmetric){ 716 ierr = MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 717 } else if (A->hermitian) { 718 ierr = MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);CHKERRQ(ierr); 719 } else if (A->structurally_symmetric) { 720 ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr); 721 } 722 PetscFunctionReturn(0); 723 } 724 725 726 727 #undef __FUNCT__ 728 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ" 729 PetscErrorCode MatGetSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 730 { 731 PetscErrorCode ierr; 732 PetscInt nmax,nstages_local,nstages,i,pos,max_no,nrow,ncol; 733 PetscBool rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix; 734 735 PetscFunctionBegin; 736 /* 737 Check for special case each processor gets entire matrix 738 */ 739 if (ismax == 1 && C->rmap->N == C->cmap->N) { 740 ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr); 741 ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr); 742 ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr); 743 ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr); 744 if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) { 745 wantallmatrix = PETSC_TRUE; 746 ierr = PetscOptionsGetBool(((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr); 747 } 748 } 749 ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,((PetscObject)C)->comm);CHKERRQ(ierr); 750 if (twantallmatrix) { 751 ierr = MatGetSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);CHKERRQ(ierr); 752 PetscFunctionReturn(0); 753 } 754 755 /* Allocate memory to hold all the submatrices */ 756 if (scall != MAT_REUSE_MATRIX) { 757 ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr); 758 } 759 /* Determine the number of stages through which submatrices are done */ 760 nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt)); 761 /* 762 Each stage will extract nmax submatrices. 
763 nmax is determined by the matrix column dimension. 764 If the original matrix has 20M columns, only one submatrix per stage is allowed, etc. 765 */ 766 if (!nmax) nmax = 1; 767 nstages_local = ismax/nmax + ((ismax % nmax)?1:0); 768 769 /* Make sure every processor loops through the nstages */ 770 ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,((PetscObject)C)->comm);CHKERRQ(ierr); 771 772 for (i=0,pos=0; i<nstages; i++) { 773 if (pos+nmax <= ismax) max_no = nmax; 774 else if (pos == ismax) max_no = 0; 775 else max_no = ismax-pos; 776 ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr); 777 pos += max_no; 778 } 779 PetscFunctionReturn(0); 780 } 781 782 /* -------------------------------------------------------------------------*/ 783 #undef __FUNCT__ 784 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local" 785 PetscErrorCode MatGetSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats) 786 { 787 Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data; 788 Mat A = c->A; 789 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat; 790 const PetscInt **icol,**irow; 791 PetscInt *nrow,*ncol,start; 792 PetscErrorCode ierr; 793 PetscMPIInt rank,size,tag0,tag1,tag2,tag3,*w1,*w2,*w3,*w4,nrqr; 794 PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc; 795 PetscInt nrqs,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol; 796 PetscInt **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2; 797 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax; 798 #if defined (PETSC_USE_CTABLE) 799 PetscTable *cmap,cmap_i,*rmap,rmap_i; 800 #else 801 PetscInt **cmap,*cmap_i,**rmap,*rmap_i; 802 #endif 803 const PetscInt *irow_i; 804 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i; 805 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3; 806 MPI_Request *r_waits4,*s_waits3,*s_waits4; 807 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2; 808 MPI_Status 
*r_status3,*r_status4,*s_status4; 809 MPI_Comm comm; 810 PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i; 811 PetscBool sorted; 812 PetscMPIInt *onodes1,*olengths1; 813 PetscMPIInt idex,idex2,end; 814 815 PetscFunctionBegin; 816 comm = ((PetscObject)C)->comm; 817 tag0 = ((PetscObject)C)->tag; 818 size = c->size; 819 rank = c->rank; 820 821 /* Get some new tags to keep the communication clean */ 822 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr); 823 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr); 824 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr); 825 826 /* Check if the col indices are sorted */ 827 for (i=0; i<ismax; i++) { 828 ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr); 829 /*if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"isrow is not sorted");*/ 830 ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr); 831 /* if (!sorted) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"iscol is not sorted"); */ 832 } 833 ierr = PetscMalloc4(ismax,const PetscInt*,&irow,ismax,const PetscInt*,&icol,ismax,PetscInt,&nrow,ismax,PetscInt,&ncol);CHKERRQ(ierr); 834 835 for (i=0; i<ismax; i++) { 836 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr); 837 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr); 838 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr); 839 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr); 840 } 841 842 /* evaluate communication - mesg to who, length of mesg, and buffer space 843 required. 
Based on this, buffers are allocated, and data copied into them*/ 844 ierr = PetscMalloc4(size,PetscMPIInt,&w1,size,PetscMPIInt,&w2,size,PetscMPIInt,&w3,size,PetscMPIInt,&w4);CHKERRQ(ierr); /* mesg size */ 845 ierr = PetscMemzero(w1,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 846 ierr = PetscMemzero(w2,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 847 ierr = PetscMemzero(w3,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 848 for (i=0; i<ismax; i++) { 849 ierr = PetscMemzero(w4,size*sizeof(PetscMPIInt));CHKERRQ(ierr); /* initialize work vector*/ 850 jmax = nrow[i]; 851 irow_i = irow[i]; 852 for (j=0; j<jmax; j++) { 853 l = 0; 854 row = irow_i[j]; 855 while (row >= C->rmap->range[l+1]) l++; 856 proc = l; 857 w4[proc]++; 858 } 859 for (j=0; j<size; j++) { 860 if (w4[j]) { w1[j] += w4[j]; w3[j]++;} 861 } 862 } 863 864 nrqs = 0; /* no of outgoing messages */ 865 msz = 0; /* total mesg length (for all procs) */ 866 w1[rank] = 0; /* no mesg sent to self */ 867 w3[rank] = 0; 868 for (i=0; i<size; i++) { 869 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */ 870 } 871 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt),&pa);CHKERRQ(ierr); /*(proc -array)*/ 872 for (i=0,j=0; i<size; i++) { 873 if (w1[i]) { pa[j] = i; j++; } 874 } 875 876 /* Each message would have a header = 1 + 2*(no of IS) + data */ 877 for (i=0; i<nrqs; i++) { 878 j = pa[i]; 879 w1[j] += w2[j] + 2* w3[j]; 880 msz += w1[j]; 881 } 882 ierr = PetscInfo2(C,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr); 883 884 /* Determine the number of messages to expect, their lengths, from from-ids */ 885 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr); 886 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr); 887 888 /* Now post the Irecvs corresponding to these messages */ 889 ierr = 
PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr); 890 891 ierr = PetscFree(onodes1);CHKERRQ(ierr); 892 ierr = PetscFree(olengths1);CHKERRQ(ierr); 893 894 /* Allocate Memory for outgoing messages */ 895 ierr = PetscMalloc4(size,PetscInt*,&sbuf1,size,PetscInt*,&ptr,2*msz,PetscInt,&tmp,size,PetscInt,&ctr);CHKERRQ(ierr); 896 ierr = PetscMemzero(sbuf1,size*sizeof(PetscInt*));CHKERRQ(ierr); 897 ierr = PetscMemzero(ptr,size*sizeof(PetscInt*));CHKERRQ(ierr); 898 899 { 900 PetscInt *iptr = tmp,ict = 0; 901 for (i=0; i<nrqs; i++) { 902 j = pa[i]; 903 iptr += ict; 904 sbuf1[j] = iptr; 905 ict = w1[j]; 906 } 907 } 908 909 /* Form the outgoing messages */ 910 /* Initialize the header space */ 911 for (i=0; i<nrqs; i++) { 912 j = pa[i]; 913 sbuf1[j][0] = 0; 914 ierr = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(PetscInt));CHKERRQ(ierr); 915 ptr[j] = sbuf1[j] + 2*w3[j] + 1; 916 } 917 918 /* Parse the isrow and copy data into outbuf */ 919 for (i=0; i<ismax; i++) { 920 ierr = PetscMemzero(ctr,size*sizeof(PetscInt));CHKERRQ(ierr); 921 irow_i = irow[i]; 922 jmax = nrow[i]; 923 for (j=0; j<jmax; j++) { /* parse the indices of each IS */ 924 l = 0; 925 row = irow_i[j]; 926 while (row >= C->rmap->range[l+1]) l++; 927 proc = l; 928 if (proc != rank) { /* copy to the outgoing buf*/ 929 ctr[proc]++; 930 *ptr[proc] = row; 931 ptr[proc]++; 932 } 933 } 934 /* Update the headers for the current IS */ 935 for (j=0; j<size; j++) { /* Can Optimise this loop too */ 936 if ((ctr_j = ctr[j])) { 937 sbuf1_j = sbuf1[j]; 938 k = ++sbuf1_j[0]; 939 sbuf1_j[2*k] = ctr_j; 940 sbuf1_j[2*k-1] = i; 941 } 942 } 943 } 944 945 /* Now post the sends */ 946 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr); 947 for (i=0; i<nrqs; ++i) { 948 j = pa[i]; 949 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr); 950 } 951 952 /* Post Receives to capture the buffer size */ 953 ierr = 
PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr); 954 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf2);CHKERRQ(ierr); 955 rbuf2[0] = tmp + msz; 956 for (i=1; i<nrqs; ++i) { 957 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]]; 958 } 959 for (i=0; i<nrqs; ++i) { 960 j = pa[i]; 961 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr); 962 } 963 964 /* Send to other procs the buf size they should allocate */ 965 966 967 /* Receive messages*/ 968 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr); 969 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr); 970 ierr = PetscMalloc3(nrqr,PetscInt*,&sbuf2,nrqr,PetscInt,&req_size,nrqr,PetscInt,&req_source);CHKERRQ(ierr); 971 { 972 Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data; 973 PetscInt *sAi = sA->i,*sBi = sB->i,id,rstart = C->rmap->rstart; 974 PetscInt *sbuf2_i; 975 976 for (i=0; i<nrqr; ++i) { 977 ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr); 978 req_size[idex] = 0; 979 rbuf1_i = rbuf1[idex]; 980 start = 2*rbuf1_i[0] + 1; 981 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr); 982 ierr = PetscMalloc((end+1)*sizeof(PetscInt),&sbuf2[idex]);CHKERRQ(ierr); 983 sbuf2_i = sbuf2[idex]; 984 for (j=start; j<end; j++) { 985 id = rbuf1_i[j] - rstart; 986 ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; 987 sbuf2_i[j] = ncols; 988 req_size[idex] += ncols; 989 } 990 req_source[idex] = r_status1[i].MPI_SOURCE; 991 /* form the header */ 992 sbuf2_i[0] = req_size[idex]; 993 for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; } 994 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr); 995 } 996 } 997 ierr = PetscFree(r_status1);CHKERRQ(ierr); 998 ierr = PetscFree(r_waits1);CHKERRQ(ierr); 999 1000 /* recv buffer sizes */ 1001 /* Receive messages*/ 1002 1003 ierr = PetscMalloc((nrqs+1)*sizeof(PetscInt*),&rbuf3);CHKERRQ(ierr); 1004 ierr = 
PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr); 1005 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr); 1006 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr); 1007 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr); 1008 1009 for (i=0; i<nrqs; ++i) { 1010 ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr); 1011 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscInt),&rbuf3[idex]);CHKERRQ(ierr); 1012 ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr); 1013 ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPIU_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr); 1014 ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr); 1015 } 1016 ierr = PetscFree(r_status2);CHKERRQ(ierr); 1017 ierr = PetscFree(r_waits2);CHKERRQ(ierr); 1018 1019 /* Wait on sends1 and sends2 */ 1020 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr); 1021 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr); 1022 1023 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);} 1024 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);} 1025 ierr = PetscFree(s_status1);CHKERRQ(ierr); 1026 ierr = PetscFree(s_status2);CHKERRQ(ierr); 1027 ierr = PetscFree(s_waits1);CHKERRQ(ierr); 1028 ierr = PetscFree(s_waits2);CHKERRQ(ierr); 1029 1030 /* Now allocate buffers for a->j, and send them off */ 1031 ierr = PetscMalloc((nrqr+1)*sizeof(PetscInt*),&sbuf_aj);CHKERRQ(ierr); 1032 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1033 ierr = PetscMalloc((j+1)*sizeof(PetscInt),&sbuf_aj[0]);CHKERRQ(ierr); 1034 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; 1035 1036 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr); 1037 { 1038 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite; 1039 PetscInt 
*cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1040 PetscInt cend = C->cmap->rend; 1041 PetscInt *a_j = a->j,*b_j = b->j,ctmp; 1042 1043 for (i=0; i<nrqr; i++) { 1044 rbuf1_i = rbuf1[i]; 1045 sbuf_aj_i = sbuf_aj[i]; 1046 ct1 = 2*rbuf1_i[0] + 1; 1047 ct2 = 0; 1048 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1049 kmax = rbuf1[i][2*j]; 1050 for (k=0; k<kmax; k++,ct1++) { 1051 row = rbuf1_i[ct1] - rstart; 1052 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1053 ncols = nzA + nzB; 1054 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row]; 1055 1056 /* load the column indices for this row into cols*/ 1057 cols = sbuf_aj_i + ct2; 1058 1059 lwrite = 0; 1060 for (l=0; l<nzB; l++) { 1061 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp; 1062 } 1063 for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l]; 1064 for (l=0; l<nzB; l++) { 1065 if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp; 1066 } 1067 1068 ct2 += ncols; 1069 } 1070 } 1071 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr); 1072 } 1073 } 1074 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr); 1075 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr); 1076 1077 /* Allocate buffers for a->a, and send them off */ 1078 ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr); 1079 for (i=0,j=0; i<nrqr; i++) j += req_size[i]; 1080 ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr); 1081 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]; 1082 1083 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr); 1084 { 1085 PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite; 1086 PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray; 1087 PetscInt cend = C->cmap->rend; 1088 PetscInt *b_j = b->j; 1089 PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a; 1090 1091 for (i=0; 
i<nrqr; i++) { 1092 rbuf1_i = rbuf1[i]; 1093 sbuf_aa_i = sbuf_aa[i]; 1094 ct1 = 2*rbuf1_i[0]+1; 1095 ct2 = 0; 1096 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) { 1097 kmax = rbuf1_i[2*j]; 1098 for (k=0; k<kmax; k++,ct1++) { 1099 row = rbuf1_i[ct1] - rstart; 1100 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row]; 1101 ncols = nzA + nzB; 1102 cworkB = b_j + b_i[row]; 1103 vworkA = a_a + a_i[row]; 1104 vworkB = b_a + b_i[row]; 1105 1106 /* load the column values for this row into vals*/ 1107 vals = sbuf_aa_i+ct2; 1108 1109 lwrite = 0; 1110 for (l=0; l<nzB; l++) { 1111 if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l]; 1112 } 1113 for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l]; 1114 for (l=0; l<nzB; l++) { 1115 if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l]; 1116 } 1117 1118 ct2 += ncols; 1119 } 1120 } 1121 ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr); 1122 } 1123 } 1124 ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr); 1125 ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr); 1126 ierr = PetscFree(rbuf1[0]);CHKERRQ(ierr); 1127 ierr = PetscFree(rbuf1);CHKERRQ(ierr); 1128 1129 /* Form the matrix */ 1130 /* create col map */ 1131 { 1132 const PetscInt *icol_i; 1133 #if defined (PETSC_USE_CTABLE) 1134 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&cmap);CHKERRQ(ierr); 1135 for (i=0; i<ismax; i++) { 1136 ierr = PetscTableCreate(ncol[i]+1,&cmap[i]);CHKERRQ(ierr); 1137 jmax = ncol[i]; 1138 icol_i = icol[i]; 1139 cmap_i = cmap[i]; 1140 for (j=0; j<jmax; j++) { 1141 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1);CHKERRQ(ierr); 1142 } 1143 } 1144 #else 1145 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&cmap);CHKERRQ(ierr); 1146 ierr = PetscMalloc(ismax*C->cmap->N*sizeof(PetscInt),&cmap[0]);CHKERRQ(ierr); 1147 ierr = PetscMemzero(cmap[0],ismax*C->cmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1148 for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->cmap->N; } 1149 
for (i=0; i<ismax; i++) { 1150 jmax = ncol[i]; 1151 icol_i = icol[i]; 1152 cmap_i = cmap[i]; 1153 for (j=0; j<jmax; j++) { 1154 cmap_i[icol_i[j]] = j+1; 1155 } 1156 } 1157 #endif 1158 } 1159 1160 /* Create lens which is required for MatCreate... */ 1161 for (i=0,j=0; i<ismax; i++) { j += nrow[i]; } 1162 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&lens);CHKERRQ(ierr); 1163 ierr = PetscMalloc(j*sizeof(PetscInt),&lens[0]);CHKERRQ(ierr); 1164 ierr = PetscMemzero(lens[0],j*sizeof(PetscInt));CHKERRQ(ierr); 1165 for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; } 1166 1167 /* Update lens from local data */ 1168 for (i=0; i<ismax; i++) { 1169 jmax = nrow[i]; 1170 cmap_i = cmap[i]; 1171 irow_i = irow[i]; 1172 lens_i = lens[i]; 1173 for (j=0; j<jmax; j++) { 1174 l = 0; 1175 row = irow_i[j]; 1176 while (row >= C->rmap->range[l+1]) l++; 1177 proc = l; 1178 if (proc == rank) { 1179 ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1180 for (k=0; k<ncols; k++) { 1181 #if defined (PETSC_USE_CTABLE) 1182 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1183 #else 1184 tcol = cmap_i[cols[k]]; 1185 #endif 1186 if (tcol) { lens_i[j]++;} 1187 } 1188 ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr); 1189 } 1190 } 1191 } 1192 1193 /* Create row map*/ 1194 #if defined (PETSC_USE_CTABLE) 1195 ierr = PetscMalloc((1+ismax)*sizeof(PetscTable),&rmap);CHKERRQ(ierr); 1196 for (i=0; i<ismax; i++) { 1197 ierr = PetscTableCreate(nrow[i]+1,&rmap[i]);CHKERRQ(ierr); 1198 rmap_i = rmap[i]; 1199 irow_i = irow[i]; 1200 jmax = nrow[i]; 1201 for (j=0; j<jmax; j++) { 1202 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1);CHKERRQ(ierr); 1203 } 1204 } 1205 #else 1206 ierr = PetscMalloc(ismax*sizeof(PetscInt*),&rmap);CHKERRQ(ierr); 1207 ierr = PetscMalloc(ismax*C->rmap->N*sizeof(PetscInt),&rmap[0]);CHKERRQ(ierr); 1208 ierr = PetscMemzero(rmap[0],ismax*C->rmap->N*sizeof(PetscInt));CHKERRQ(ierr); 1209 for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->rmap->N;} 1210 
for (i=0; i<ismax; i++) { 1211 rmap_i = rmap[i]; 1212 irow_i = irow[i]; 1213 jmax = nrow[i]; 1214 for (j=0; j<jmax; j++) { 1215 rmap_i[irow_i[j]] = j; 1216 } 1217 } 1218 #endif 1219 1220 /* Update lens from offproc data */ 1221 { 1222 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i; 1223 1224 for (tmp2=0; tmp2<nrqs; tmp2++) { 1225 ierr = MPI_Waitany(nrqs,r_waits3,&idex2,r_status3+tmp2);CHKERRQ(ierr); 1226 idex = pa[idex2]; 1227 sbuf1_i = sbuf1[idex]; 1228 jmax = sbuf1_i[0]; 1229 ct1 = 2*jmax+1; 1230 ct2 = 0; 1231 rbuf2_i = rbuf2[idex2]; 1232 rbuf3_i = rbuf3[idex2]; 1233 for (j=1; j<=jmax; j++) { 1234 is_no = sbuf1_i[2*j-1]; 1235 max1 = sbuf1_i[2*j]; 1236 lens_i = lens[is_no]; 1237 cmap_i = cmap[is_no]; 1238 rmap_i = rmap[is_no]; 1239 for (k=0; k<max1; k++,ct1++) { 1240 #if defined (PETSC_USE_CTABLE) 1241 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr); 1242 row--; 1243 if (row < 0) { SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table"); } 1244 #else 1245 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */ 1246 #endif 1247 max2 = rbuf2_i[ct1]; 1248 for (l=0; l<max2; l++,ct2++) { 1249 #if defined (PETSC_USE_CTABLE) 1250 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1251 #else 1252 tcol = cmap_i[rbuf3_i[ct2]]; 1253 #endif 1254 if (tcol) { 1255 lens_i[row]++; 1256 } 1257 } 1258 } 1259 } 1260 } 1261 } 1262 ierr = PetscFree(r_status3);CHKERRQ(ierr); 1263 ierr = PetscFree(r_waits3);CHKERRQ(ierr); 1264 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);} 1265 ierr = PetscFree(s_status3);CHKERRQ(ierr); 1266 ierr = PetscFree(s_waits3);CHKERRQ(ierr); 1267 1268 /* Create the submatrices */ 1269 if (scall == MAT_REUSE_MATRIX) { 1270 PetscBool flag; 1271 1272 /* 1273 Assumes new rows are same length as the old rows,hence bug! 
1274 */ 1275 for (i=0; i<ismax; i++) { 1276 mat = (Mat_SeqAIJ *)(submats[i]->data); 1277 if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size"); 1278 ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->rmap->n*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1279 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros"); 1280 /* Initial matrix as if empty */ 1281 ierr = PetscMemzero(mat->ilen,submats[i]->rmap->n*sizeof(PetscInt));CHKERRQ(ierr); 1282 submats[i]->factortype = C->factortype; 1283 } 1284 } else { 1285 for (i=0; i<ismax; i++) { 1286 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr); 1287 ierr = MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1288 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr); 1289 ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr); 1290 } 1291 } 1292 1293 /* Assemble the matrices */ 1294 /* First assemble the local rows */ 1295 { 1296 PetscInt ilen_row,*imat_ilen,*imat_j,*imat_i,old_row; 1297 PetscScalar *imat_a; 1298 1299 for (i=0; i<ismax; i++) { 1300 mat = (Mat_SeqAIJ*)submats[i]->data; 1301 imat_ilen = mat->ilen; 1302 imat_j = mat->j; 1303 imat_i = mat->i; 1304 imat_a = mat->a; 1305 cmap_i = cmap[i]; 1306 rmap_i = rmap[i]; 1307 irow_i = irow[i]; 1308 jmax = nrow[i]; 1309 for (j=0; j<jmax; j++) { 1310 l = 0; 1311 row = irow_i[j]; 1312 while (row >= C->rmap->range[l+1]) l++; 1313 proc = l; 1314 if (proc == rank) { 1315 old_row = row; 1316 #if defined (PETSC_USE_CTABLE) 1317 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1318 row--; 1319 #else 1320 row = rmap_i[row]; 1321 #endif 1322 ilen_row = imat_ilen[row]; 1323 ierr = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1324 mat_i = imat_i[row] ; 1325 mat_a = imat_a + mat_i; 1326 mat_j = imat_j + mat_i; 1327 for (k=0; k<ncols; k++) { 1328 #if defined 
(PETSC_USE_CTABLE) 1329 ierr = PetscTableFind(cmap_i,cols[k]+1,&tcol);CHKERRQ(ierr); 1330 #else 1331 tcol = cmap_i[cols[k]]; 1332 #endif 1333 if (tcol){ 1334 *mat_j++ = tcol - 1; 1335 *mat_a++ = vals[k]; 1336 ilen_row++; 1337 } 1338 } 1339 ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr); 1340 imat_ilen[row] = ilen_row; 1341 } 1342 } 1343 } 1344 } 1345 1346 /* Now assemble the off proc rows*/ 1347 { 1348 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen; 1349 PetscInt *imat_j,*imat_i; 1350 PetscScalar *imat_a,*rbuf4_i; 1351 1352 for (tmp2=0; tmp2<nrqs; tmp2++) { 1353 ierr = MPI_Waitany(nrqs,r_waits4,&idex2,r_status4+tmp2);CHKERRQ(ierr); 1354 idex = pa[idex2]; 1355 sbuf1_i = sbuf1[idex]; 1356 jmax = sbuf1_i[0]; 1357 ct1 = 2*jmax + 1; 1358 ct2 = 0; 1359 rbuf2_i = rbuf2[idex2]; 1360 rbuf3_i = rbuf3[idex2]; 1361 rbuf4_i = rbuf4[idex2]; 1362 for (j=1; j<=jmax; j++) { 1363 is_no = sbuf1_i[2*j-1]; 1364 rmap_i = rmap[is_no]; 1365 cmap_i = cmap[is_no]; 1366 mat = (Mat_SeqAIJ*)submats[is_no]->data; 1367 imat_ilen = mat->ilen; 1368 imat_j = mat->j; 1369 imat_i = mat->i; 1370 imat_a = mat->a; 1371 max1 = sbuf1_i[2*j]; 1372 for (k=0; k<max1; k++,ct1++) { 1373 row = sbuf1_i[ct1]; 1374 #if defined (PETSC_USE_CTABLE) 1375 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr); 1376 row--; 1377 #else 1378 row = rmap_i[row]; 1379 #endif 1380 ilen = imat_ilen[row]; 1381 mat_i = imat_i[row] ; 1382 mat_a = imat_a + mat_i; 1383 mat_j = imat_j + mat_i; 1384 max2 = rbuf2_i[ct1]; 1385 for (l=0; l<max2; l++,ct2++) { 1386 1387 #if defined (PETSC_USE_CTABLE) 1388 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr); 1389 #else 1390 tcol = cmap_i[rbuf3_i[ct2]]; 1391 #endif 1392 if (tcol) { 1393 *mat_j++ = tcol - 1; 1394 *mat_a++ = rbuf4_i[ct2]; 1395 ilen++; 1396 } 1397 } 1398 imat_ilen[row] = ilen; 1399 } 1400 } 1401 } 1402 } 1403 ierr = PetscFree(r_status4);CHKERRQ(ierr); 1404 ierr = PetscFree(r_waits4);CHKERRQ(ierr); 1405 if (nrqr) {ierr = 
MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);} 1406 ierr = PetscFree(s_waits4);CHKERRQ(ierr); 1407 ierr = PetscFree(s_status4);CHKERRQ(ierr); 1408 1409 /* Restore the indices */ 1410 for (i=0; i<ismax; i++) { 1411 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr); 1412 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr); 1413 } 1414 1415 /* Destroy allocated memory */ 1416 ierr = PetscFree4(irow,icol,nrow,ncol);CHKERRQ(ierr); 1417 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr); 1418 ierr = PetscFree(pa);CHKERRQ(ierr); 1419 1420 ierr = PetscFree4(sbuf1,ptr,tmp,ctr);CHKERRQ(ierr); 1421 ierr = PetscFree(rbuf2);CHKERRQ(ierr); 1422 for (i=0; i<nrqr; ++i) { 1423 ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr); 1424 } 1425 for (i=0; i<nrqs; ++i) { 1426 ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr); 1427 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr); 1428 } 1429 1430 ierr = PetscFree3(sbuf2,req_size,req_source);CHKERRQ(ierr); 1431 ierr = PetscFree(rbuf3);CHKERRQ(ierr); 1432 ierr = PetscFree(rbuf4);CHKERRQ(ierr); 1433 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr); 1434 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr); 1435 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr); 1436 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr); 1437 #if defined (PETSC_USE_CTABLE) 1438 for (i=0; i<ismax; i++) { 1439 ierr = PetscTableDestroy((PetscTable)cmap[i]);CHKERRQ(ierr); 1440 ierr = PetscTableDestroy((PetscTable)rmap[i]);CHKERRQ(ierr); 1441 } 1442 #else 1443 ierr = PetscFree(cmap[0]);CHKERRQ(ierr); 1444 ierr = PetscFree(rmap[0]);CHKERRQ(ierr); 1445 #endif 1446 ierr = PetscFree(cmap);CHKERRQ(ierr); 1447 ierr = PetscFree(rmap);CHKERRQ(ierr); 1448 ierr = PetscFree(lens[0]);CHKERRQ(ierr); 1449 ierr = PetscFree(lens);CHKERRQ(ierr); 1450 1451 for (i=0; i<ismax; i++) { 1452 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1453 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1454 } 1455 PetscFunctionReturn(0); 1456 } 1457 1458 /* 1459 Observe that the Seq matrices used to construct this 
MPI matrix are not increfed. 1460 Be careful not to destroy them elsewhere. 1461 */ 1462 #undef __FUNCT__ 1463 #define __FUNCT__ "MatCreateMPIAIJFromSeqMatrices_Private" 1464 PetscErrorCode MatCreateMPIAIJFromSeqMatrices_Private(MPI_Comm comm, Mat A, Mat B, Mat *C) 1465 { 1466 /* If making this function public, change the error returned in this function away from _PLIB. */ 1467 PetscErrorCode ierr; 1468 Mat_MPIAIJ *aij; 1469 PetscBool seqaij; 1470 1471 PetscFunctionBegin; 1472 /* Check to make sure the component matrices are compatible with C. */ 1473 ierr = PetscTypeCompare((PetscObject)A, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1474 if(!seqaij) { 1475 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Diagonal matrix is of wrong type"); 1476 } 1477 ierr = PetscTypeCompare((PetscObject)B, MATSEQAIJ, &seqaij); CHKERRQ(ierr); 1478 if(!seqaij) { 1479 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Off-diagonal matrix is of wrong type"); 1480 } 1481 if(A->rmap->n != B->rmap->n || A->rmap->bs != B->rmap->bs || A->cmap->bs != B->cmap->bs) { 1482 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1483 } 1484 ierr = MatCreate(comm, C); CHKERRQ(ierr); 1485 ierr = MatSetSizes(*C,A->rmap->n, A->cmap->n, PETSC_DECIDE, PETSC_DECIDE); CHKERRQ(ierr); 1486 ierr = PetscLayoutSetBlockSize((*C)->rmap,A->rmap->bs);CHKERRQ(ierr); 1487 ierr = PetscLayoutSetBlockSize((*C)->cmap,A->cmap->bs);CHKERRQ(ierr); 1488 ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 1489 ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 1490 if((*C)->cmap->N != A->cmap->n + B->cmap->n) { 1491 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Incompatible component matrices of an MPIAIJ matrix"); 1492 } 1493 ierr = MatSetType(*C, MATMPIAIJ); CHKERRQ(ierr); 1494 aij = (Mat_MPIAIJ*)((*C)->data); 1495 aij->A = A; 1496 aij->B = B; 1497 ierr = PetscLogObjectParent(*C,A); CHKERRQ(ierr); 1498 ierr = PetscLogObjectParent(*C,B); CHKERRQ(ierr); 1499 (*C)->preallocated = (PetscBool)(A->preallocated && 
B->preallocated); 1500 (*C)->assembled = (PetscBool)(A->assembled && B->assembled); 1501 PetscFunctionReturn(0); 1502 } 1503 1504 #undef __FUNCT__ 1505 #define __FUNCT__ "MatMPIAIJExtractSeqMatrices_Private" 1506 PetscErrorCode MatMPIAIJExtractSeqMatrices_Private(Mat C, Mat *A, Mat *B) 1507 { 1508 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) (C->data); 1509 PetscFunctionBegin; 1510 PetscValidPointer(A,2); 1511 PetscValidPointer(B,3); 1512 *A = aij->A; 1513 *B = aij->B; 1514 /* Note that we don't incref *A and *B, so be careful! */ 1515 PetscFunctionReturn(0); 1516 } 1517 1518 1519 #undef __FUNCT__ 1520 #define __FUNCT__ "MatGetSubMatricesParallel_MPIXAIJ" 1521 PetscErrorCode MatGetSubMatricesParallel_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[], 1522 PetscErrorCode(*getsubmats_seq)(Mat, PetscInt, const IS[], const IS[], MatReuse, Mat**), 1523 PetscErrorCode(*makefromseq)(MPI_Comm, Mat, Mat,Mat*), 1524 PetscErrorCode(*extractseq)(Mat, Mat*, Mat*)) 1525 { 1526 PetscErrorCode ierr; 1527 PetscMPIInt size, flag; 1528 PetscInt i,ii; 1529 PetscInt ismax_c; 1530 PetscFunctionBegin; 1531 if(!ismax) { 1532 PetscFunctionReturn(0); 1533 } 1534 for(i = 0, ismax_c = 0; i < ismax; ++i) { 1535 ierr = MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm, &flag); CHKERRQ(ierr); 1536 if(flag != MPI_IDENT) { 1537 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator"); 1538 } 1539 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1540 if(size > 1) { 1541 ++ismax_c; 1542 } 1543 } 1544 if(!ismax_c) { /* Sequential ISs only, so can call the sequential matrix extraction subroutine. 
*/ 1545 ierr = (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat); CHKERRQ(ierr); 1546 } 1547 else { /* if(ismax_c) */ 1548 Mat *A,*B; 1549 IS *isrow_c, *iscol_c; 1550 PetscMPIInt size; 1551 /* 1552 Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate 1553 array of sequential matrices underlying the resulting parallel matrices. 1554 Which arrays to allocate is based on the value of MatReuse scall. 1555 There are as many diag matrices as there are original index sets. 1556 There are only as many parallel and off-diag matrices, as there are parallel (comm size > 1) index sets. 1557 1558 Sequential matrix arrays are allocated in any event: even if the array of parallel matrices already exists, 1559 we need to consolidate the underlying seq matrices into as single array to serve as placeholders into getsubmats_seq 1560 will deposite the extracted diag and off-diag parts. 1561 However, if reuse is taking place, we have to allocate the seq matrix arrays here. 1562 If reuse is NOT taking place, then the seq matrix arrays are allocated by getsubmats_seq. 1563 */ 1564 1565 /* Parallel matrix array is allocated only if no reuse is taking place. */ 1566 if (scall != MAT_REUSE_MATRIX) { 1567 ierr = PetscMalloc((ismax)*sizeof(Mat),submat);CHKERRQ(ierr); 1568 } 1569 else { 1570 ierr = PetscMalloc(ismax*sizeof(Mat), &A); CHKERRQ(ierr); 1571 ierr = PetscMalloc(ismax_c*sizeof(Mat), &B); CHKERRQ(ierr); 1572 /* If parallel matrices are being reused, then simply reuse the underlying seq matrices as well. */ 1573 for(i = 0, ii = 0; i < ismax; ++i) { 1574 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1575 if(size > 1) { 1576 ierr = (*extractseq)((*submat)[i],A+i,B+ii); CHKERRQ(ierr); 1577 ++ii; 1578 } 1579 else { 1580 A[i] = (*submat)[i]; 1581 } 1582 } 1583 } 1584 /* 1585 Construct the complements of the iscol ISs for parallel ISs only. 
1586 These are used to extract the off-diag portion of the resulting parallel matrix. 1587 The row IS for the off-diag portion is the same as for the diag portion, 1588 so we merely alias the row IS, while skipping those that are sequential. 1589 */ 1590 ierr = PetscMalloc2(ismax_c,IS,&isrow_c, ismax_c, IS, &iscol_c); CHKERRQ(ierr); 1591 for(i = 0, ii = 0; i < ismax; ++i) { 1592 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1593 if(size > 1) { 1594 isrow_c[ii] = isrow[i]; 1595 ierr = ISGetNonlocalIS(iscol[i], &(iscol_c[ii])); CHKERRQ(ierr); 1596 ++ii; 1597 } 1598 } 1599 /* Now obtain the sequential A and B submatrices separately. */ 1600 ierr = (*getsubmats_seq)(C,ismax,isrow, iscol,scall, &A); CHKERRQ(ierr); 1601 ierr = (*getsubmats_seq)(C,ismax_c,isrow_c, iscol_c,scall, &B); CHKERRQ(ierr); 1602 for(ii = 0; ii < ismax_c; ++ii) { 1603 ierr = ISDestroy(iscol_c[ii]); CHKERRQ(ierr); 1604 } 1605 ierr = PetscFree2(isrow_c, iscol_c); CHKERRQ(ierr); 1606 /* 1607 If scall == MAT_REUSE_MATRIX, we are done, since the sequential matrices A & B 1608 have been extracted directly into the parallel matrices containing them, or 1609 simply into the sequential matrix identical with the corresponding A (if size == 1). 1610 Otherwise, make sure that parallel matrices are constructed from A & B, or the 1611 A is put into the correct submat slot (if size == 1). 1612 */ 1613 if(scall != MAT_REUSE_MATRIX) { 1614 for(i = 0, ii = 0; i < ismax; ++i) { 1615 ierr = MPI_Comm_size(((PetscObject)isrow[i])->comm, &size); CHKERRQ(ierr); 1616 if(size > 1) { 1617 /* 1618 For each parallel isrow[i], create parallel matrices from the extracted sequential matrices. 1619 */ 1620 /* Construct submat[i] from the Seq pieces A and B. 
*/ 1621 ierr = (*makefromseq)(((PetscObject)isrow[i])->comm, A[i], B[ii], (*submat)+i); CHKERRQ(ierr); 1622 1623 ++ii; 1624 } 1625 else { 1626 (*submat)[i] = A[i]; 1627 } 1628 } 1629 } 1630 ierr = PetscFree(A); CHKERRQ(ierr); 1631 ierr = PetscFree(B); CHKERRQ(ierr); 1632 } 1633 PetscFunctionReturn(0); 1634 }/* MatGetSubMatricesParallel_MPIXAIJ() */ 1635 1636 1637 1638 #undef __FUNCT__ 1639 #define __FUNCT__ "MatGetSubMatricesParallel_MPIAIJ" 1640 PetscErrorCode MatGetSubMatricesParallel_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[]) 1641 { 1642 PetscErrorCode ierr; 1643 PetscFunctionBegin; 1644 ierr = MatGetSubMatricesParallel_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatGetSubMatrices_MPIAIJ, 1645 MatCreateMPIAIJFromSeqMatrices_Private, 1646 MatMPIAIJExtractSeqMatrices_Private); 1647 CHKERRQ(ierr); 1648 PetscFunctionReturn(0); 1649 } 1650