#ifdef PETSC_RCS_HEADER
static char vcid[] = "$Id: mmdense.c,v 1.13 1997/10/19 03:25:11 bsmith Exp balay $";
#endif

/*
   Support for the parallel dense matrix vector multiply
*/
#include "src/mat/impls/dense/mpi/mpidense.h"
#include "src/vec/vecimpl.h"

#undef __FUNC__
#define __FUNC__ "MatSetUpMultiply_MPIDense"
/*
   MatSetUpMultiply_MPIDense - Builds the communication machinery needed by
   the parallel dense matrix-vector product: a sequential work vector
   (mdn->lvec) holding ALL N entries of the input vector, and a scatter
   context (mdn->Mvctx) that gathers the full distributed vector into it.

   Input Parameter:
.  mat - the MPI dense matrix (mat->data is a Mat_MPIDense)

   Notes:
   The temporary IS and global Vec exist only to define the scatter and are
   destroyed before returning; lvec and Mvctx persist on the matrix.
*/
int MatSetUpMultiply_MPIDense(Mat mat)
{
  Mat_MPIDense *mdn = (Mat_MPIDense *) mat->data;
  int          ierr,n;
  IS           tofrom;
  Vec          gvec;

  PetscFunctionBegin;
  /* Create local vector that is used to scatter into; it holds all N
     columns' worth of entries, i.e. the entire parallel vector. */
  ierr = VecCreateSeq(PETSC_COMM_SELF,mdn->N,&mdn->lvec); CHKERRQ(ierr);

  /* Create temporary index set for building scatter gather; the identity
     stride 0..N-1 is used on both the "to" and "from" sides. */
  ierr = ISCreateStride(PETSC_COMM_SELF,mdn->N,0,1,&tofrom); CHKERRQ(ierr);

  /* Create temporary global vector to generate scatter context; its local
     size n matches this process's column ownership range (cowners). */
  n = mdn->cowners[mdn->rank+1] - mdn->cowners[mdn->rank];
  ierr = VecCreateMPI(mat->comm,n,mdn->N,&gvec); CHKERRQ(ierr);

  /* Generate the scatter context: gather the whole of gvec into lvec. */
  ierr = VecScatterCreate(gvec,tofrom,mdn->lvec,tofrom,&mdn->Mvctx); CHKERRQ(ierr);
  PLogObjectParent(mat,mdn->Mvctx);
  PLogObjectParent(mat,mdn->lvec);
  PLogObjectParent(mat,tofrom);
  PLogObjectParent(mat,gvec);

  /* The IS and global vector were only needed to define the scatter. */
  ierr = ISDestroy(tofrom); CHKERRQ(ierr);
  ierr = VecDestroy(gvec); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   NOTE(review): everything below is compiled OUT (__JUNK__ is never
   defined).  It is an in-progress adaptation of the MPIAIJ submatrix
   extraction code to the dense case and does not compile as written --
   see the NOTE(review) comments inside for the specific problems.
*/
#if defined (__JUNK__)

int MatGetSubMatrices_MPIDense_Local(Mat,int,IS*,IS*,MatGetSubMatrixCall,Mat*);
#undef __FUNC__
#define __FUNC__ "MatGetSubMatrices_MPIDense"
/*
   MatGetSubMatrices_MPIDense - Extracts ismax sequential submatrices from
   the parallel dense matrix C, one per (isrow[i], iscol[i]) pair.

   The work is split into stages so that the per-stage communication
   buffers stay under roughly 20 MB; every process executes the same
   number of stages (possibly with max_no == 0) so the collective calls
   inside the worker stay matched.
*/
int MatGetSubMatrices_MPIDense(Mat C,int ismax,IS *isrow,IS *iscol,
                               MatGetSubMatrixCall scall,Mat **submat)
{
  Mat_MPIDense *c = (Mat_MPIDense *) C->data;
  int          nmax,nstages_local,nstages,i,pos,max_no,ierr;

  PetscFunctionBegin;
  /* Allocate memory to hold all the submatrices */
  if (scall != MAT_REUSE_MATRIX) {
    *submat = (Mat *)PetscMalloc((ismax+1)*sizeof(Mat));CHKPTRQ(*submat);
  }
  /* Determine the number of stages through which submatrices are done:
     nmax = how many index sets fit in ~20 MB of int buffer per stage. */
  nmax          = 20*1000000 / (c->N * sizeof(int));
  if (nmax == 0) nmax = 1;
  nstages_local = ismax/nmax + ((ismax % nmax)?1:0);

  /* Make sure every processor loops through the nstages; the worker does
     collective communication, so all ranks must take the max. */
  ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);

  for ( i=0,pos=0; i<nstages; i++ ) {
    /* max_no may be 0 on ranks that have already exhausted their ISs. */
    if (pos+nmax <= ismax) max_no = nmax;
    else if (pos == ismax) max_no = 0;
    else                   max_no = ismax-pos;
    ierr = MatGetSubMatrices_MPIDense_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
    pos += max_no;
  }
  PetscFunctionReturn(0);
}
/* -------------------------------------------------------------------------*/
#undef __FUNC__
#define __FUNC__ "MatGetSubMatrices_MPIDense_Local"
/*
   MatGetSubMatrices_MPIDense_Local - One stage of parallel submatrix
   extraction.  Adapted from the MPIAIJ implementation; the protocol is:

     1. Count, per destination rank, how many requested rows live there
        (w1..w4 work arrays) and exchange message counts/sizes globally.
     2. Send row-request messages (tag0); each message is
        [#IS, (IS-id, #rows)*, row indices...].
     3. Requestees reply first with per-row column counts (tag1) -- for a
        dense matrix every row simply has c->N columns -- then with the
        column indices (tag2) and values (tag3).
     4. Build per-IS column maps (cmap: global col -> 1-based local col)
        and row maps (rmap: global row -> local row), size the submatrices
        (lens), create or reuse them, then assemble local rows directly
        via MatGetRow and remote rows from the received buffers.

   NOTE(review): this routine is dead code (#if __JUNK__) and cannot
   compile as written:
     - `b`, used as b->i / b->j / b->a in the two packing sections, is
       never declared; it is a leftover of the AIJ off-diagonal block, as
       are c->cstart, c->garray and the Mat_SeqDense fields ilen/i/j used
       below -- presumably these have no dense-matrix counterparts; verify
       against mpidense.h/dense.h before resurrecting this code.
     - sbuf_aj is indexed when packing column indices, but its allocation
       is commented out ("no space required") -- the packing section was
       not yet converted to the dense layout.
*/
int MatGetSubMatrices_MPIDense_Local(Mat C,int ismax,IS *isrow,IS *iscol,
                                     MatGetSubMatrixCall scall,Mat *submats)
{
  Mat_MPIDense *c = (Mat_MPIDense *) C->data;
  Mat          A = c->A;
  Mat_SeqDense *a = (Mat_SeqDense*)A->data, *mat;
  int          **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
  int          **sbuf1,**sbuf2, rank, m,i,j,k,l,ct1,ct2,ierr, **rbuf1,row,proc;
  int          nrqs, msz, **ptr,index,*req_size,*ctr,*pa,*tmp,tcol,bsz,nrqr;
  int          **rbuf3,*req_source,**sbuf_aj, **rbuf2, max1,max2,**rmap;
  int          **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
  int          len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
  int          *rmap_i,tag0,tag1,tag2,tag3;
  MPI_Request  *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
  MPI_Request  *r_waits4,*s_waits3,*s_waits4;
  MPI_Status   *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
  MPI_Status   *r_status3,*r_status4,*s_status4;
  MPI_Comm     comm;
  Scalar       **rbuf4, **sbuf_aa, *vals, *mat_a, *sbuf_aa_i;

  PetscFunctionBegin;
  comm = C->comm;
  tag0 = C->tag;
  size = c->size;
  rank = c->rank;
  m    = c->M;

  /* Get some new tags to keep the communication clean */
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag3); CHKERRQ(ierr);

  /* Check if the col indices are sorted */
  /* NOTE(review): the ISSorted return codes are stored in ierr but never
     CHKERRQ-checked; also j is reused here as a PetscTruth via cast. */
  for ( i=0; i<ismax; i++ ) {
    ierr = ISSorted(isrow[i],(PetscTruth*)&j);
    if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"ISrow is not sorted");
    ierr = ISSorted(iscol[i],(PetscTruth*)&j);
    if (!j) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,0,"IScol is not sorted");
  }

  /* One allocation carved into: irow/icol (index pointers), nrow/ncol
     (index-set sizes) and rtable (global row -> owning rank). */
  len    = (2*ismax+1)*(sizeof(int *) + sizeof(int)) + (m+1)*sizeof(int);
  irow   = (int **)PetscMalloc(len); CHKPTRQ(irow);
  icol   = irow + ismax;
  nrow   = (int *) (icol + ismax);
  ncol   = nrow + ismax;
  rtable = ncol + ismax;

  for ( i=0; i<ismax; i++ ) {
    ierr = ISGetIndices(isrow[i],&irow[i]); CHKERRQ(ierr);
    ierr = ISGetIndices(iscol[i],&icol[i]); CHKERRQ(ierr);
    ierr = ISGetSize(isrow[i],&nrow[i]); CHKERRQ(ierr);
    ierr = ISGetSize(iscol[i],&ncol[i]); CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc*/
  for ( i=0,j=0; i<size; i++ ) {
    jmax = c->rowners[i+1];
    for ( ; j<jmax; j++ ) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  w1 = (int *)PetscMalloc(size*4*sizeof(int));CHKPTRQ(w1); /* mesg size */
  w2 = w1 + size;        /* if w2[i] marked, then a message to proc i*/
  w3 = w2 + size;        /* no of IS that needs to be sent to proc i */
  w4 = w3 + size;        /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,size*3*sizeof(int)); /* initialize work vector*/
  for ( i=0; i<ismax; i++ ) {
    PetscMemzero(w4,size*sizeof(int)); /* initialize work vector*/
    jmax   = nrow[i];
    irow_i = irow[i];
    for ( j=0; j<jmax; j++ ) {
      row  = irow_i[j];
      proc = rtable[row];
      w4[proc]++;
    }
    for ( j=0; j<size; j++ ) {
      if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
    }
  }

  nrqs     = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to self */
  w3[rank] = 0;
  for ( i=0; i<size; i++ ) {
    if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((nrqs+1)*sizeof(int));CHKPTRQ(pa); /*(proc -array)*/
  for ( i=0, j=0; i<size; i++ ) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for ( i=0; i<nrqs; i++ ) {
    j       = pa[i];
    w1[j]  += w2[j] + 2* w3[j];
    msz    += w1[j];
  }
  /* Do a global reduction to determine how many messages to expect*/
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1+size;
    /* bsz = largest message anyone sends me; nrqr = how many senders. */
    ierr = MPI_Allreduce(w1, rw1, size, MPI_INT, MPI_MAX, comm);CHKERRQ(ierr);
    bsz  = rw1[rank];
    ierr = MPI_Allreduce(w2, rw2, size, MPI_INT, MPI_SUM, comm);CHKERRQ(ierr);
    nrqr = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers . Prob none if nrqr = 0 ???? */
  len      = (nrqr+1)*sizeof(int*) + nrqr*bsz*sizeof(int);
  rbuf1    = (int**) PetscMalloc(len); CHKPTRQ(rbuf1);
  rbuf1[0] = (int *) (rbuf1 + nrqr);
  for ( i=1; i<nrqr; ++i ) rbuf1[i] = rbuf1[i-1] + bsz;

  /* Post the receives (row-request messages can come from any source). */
  r_waits1 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(r_waits1);
  for ( i=0; i<nrqr; ++i ) {
    ierr = MPI_Irecv(rbuf1[i],bsz,MPI_INT,MPI_ANY_SOURCE,tag0,comm,r_waits1+i);CHKERRQ(ierr);
  }

  /* Allocate Memory for outgoing messages */
  len   = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
  sbuf1 = (int **)PetscMalloc(len); CHKPTRQ(sbuf1);
  ptr   = sbuf1 + size;     /* Pointers to the data in outgoing buffers */
  PetscMemzero(sbuf1,2*size*sizeof(int*));
  /* allocate memory for outgoing data + buf to receive the first reply */
  tmp = (int *) (ptr + size);
  ctr = tmp + 2*msz;

  /* Carve tmp into per-destination send buffers sbuf1[j] of size w1[j]. */
  {
    int *iptr = tmp,ict = 0;
    for ( i=0; i<nrqs; i++ ) {
      j        = pa[i];
      iptr    += ict;
      sbuf1[j] = iptr;
      ict      = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Initialize the header space */
  for ( i=0; i<nrqs; i++ ) {
    j           = pa[i];
    sbuf1[j][0] = 0;
    PetscMemzero(sbuf1[j]+1, 2*w3[j]*sizeof(int));
    ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
  }

  /* Parse the isrow and copy data into outbuf */
  for ( i=0; i<ismax; i++ ) {
    PetscMemzero(ctr,size*sizeof(int));
    irow_i = irow[i];
    jmax   = nrow[i];
    for ( j=0; j<jmax; j++ ) {  /* parse the indices of each IS */
      row  = irow_i[j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
    }
    /* Update the headers for the current IS */
    for ( j=0; j<size; j++ ) { /* Can Optimise this loop too */
      if ((ctr_j = ctr[j])) {
        sbuf1_j        = sbuf1[j];
        k              = ++sbuf1_j[0];       /* bump the IS count */
        sbuf1_j[2*k]   = ctr_j;              /* #rows requested from IS i */
        sbuf1_j[2*k-1] = i;                  /* which IS these rows belong to */
      }
    }
  }

  /* Now post the sends */
  s_waits1 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(s_waits1);
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    /* printf("[%d] Send Req to %d: size %d \n", rank,j, w1[j]); */
    ierr = MPI_Isend( sbuf1[j], w1[j], MPI_INT, j, tag0, comm, s_waits1+i);CHKERRQ(ierr);
  }

  /* Post Receives to capture the buffer size (first replies, tag1);
     these land in the tail of tmp, reusing the request layout. */
  r_waits2 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits2);
  rbuf2 = (int**)PetscMalloc((nrqs+1)*sizeof(int *));CHKPTRQ(rbuf2);
  rbuf2[0] = tmp + msz;
  for ( i=1; i<nrqs; ++i ) {
    j = pa[i];
    rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
  }
  for ( i=0; i<nrqs; ++i ) {
    j = pa[i];
    ierr = MPI_Irecv( rbuf2[i], w1[j], MPI_INT, j, tag1, comm, r_waits2+i);CHKERRQ(ierr);
  }

  /* Send to other procs the buf size they should allocate */

  /* Receive messages*/
  s_waits2  = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits2);
  r_status1 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(r_status1);
  len       = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
  sbuf2     = (int**) PetscMalloc(len);CHKPTRQ(sbuf2);
  req_size  = (int *) (sbuf2 + nrqr);
  req_source = req_size + nrqr;

  {
    int *sbuf2_i;

    for ( i=0; i<nrqr; ++i ) {
      /* Service request messages in whatever order they arrive. */
      ierr = MPI_Waitany(nrqr, r_waits1, &index, r_status1+i);CHKERRQ(ierr);
      req_size[index] = 0;
      rbuf1_i         = rbuf1[index];
      start           = 2*rbuf1_i[0] + 1;    /* header length: 1 + 2*#IS */
      MPI_Get_count(r_status1+i,MPI_INT, &end);
      sbuf2[index] = (int *)PetscMalloc((end+1)*sizeof(int));CHKPTRQ(sbuf2[index]);
      sbuf2_i      = sbuf2[index];
      for ( j=start; j<end; j++ ) {
        /** ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id]; */
        /** ncols is now the whole row in the dense case */
        /** Can get rid of this unnecessary communication */
        ncols            = c->N;
        sbuf2_i[j]       = ncols;
        req_size[index] += ncols;
      }
      req_source[index] = r_status1[i].MPI_SOURCE;
      /* form the header */
      sbuf2_i[0] = req_size[index];
      for ( j=1; j<start; j++ ) { sbuf2_i[j] = rbuf1_i[j]; }
      ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i); CHKERRQ(ierr);
    }
  }
  PetscFree(r_status1); PetscFree(r_waits1);

  /* recv buffer sizes */
  /* Receive messages*/

  rbuf3 = (int**)PetscMalloc((nrqs+1)*sizeof(int*)); CHKPTRQ(rbuf3);
  rbuf4 = (Scalar**)PetscMalloc((nrqs+1)*sizeof(Scalar*));CHKPTRQ(rbuf4);
  r_waits3 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits3);
  r_waits4 = (MPI_Request *) PetscMalloc((nrqs+1)*sizeof(MPI_Request));CHKPTRQ(r_waits4);
  r_status2 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status2);

  for ( i=0; i<nrqs; ++i ) {
    /* rbuf2[index][0] is the total data size promised by that sender;
       post the index (tag2) and value (tag3) receives sized from it. */
    ierr = MPI_Waitany(nrqs, r_waits2, &index, r_status2+i);CHKERRQ(ierr);
    rbuf3[index] = (int *)PetscMalloc((rbuf2[index][0]+1)*sizeof(int));CHKPTRQ(rbuf3[index]);
    rbuf4[index] = (Scalar *)PetscMalloc((rbuf2[index][0]+1)*sizeof(Scalar));CHKPTRQ(rbuf4[index]);
    ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0], MPI_INT,
                     r_status2[i].MPI_SOURCE, tag2, comm, r_waits3+index); CHKERRQ(ierr);
    ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0], MPIU_SCALAR,
                     r_status2[i].MPI_SOURCE, tag3, comm, r_waits4+index); CHKERRQ(ierr);
  }
  PetscFree(r_status2); PetscFree(r_waits2);

  /* Wait on sends1 and sends2 */
  s_status1 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(s_status1);
  s_status2 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status2);

  ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
  ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
  PetscFree(s_status1); PetscFree(s_status2);
  PetscFree(s_waits1); PetscFree(s_waits2);

  /* Now allocate buffers for a->j, and send them off */
  /** No space required for a->j... as the complete row is packed and sent */
  /** sbuf_aj = (int **)PetscMalloc((nrqr+1)*sizeof(int *));CHKPTRQ(sbuf_aj); */
  /** for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i]; */
  /** sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]); */
  /** for ( i=1; i<nrqr; i++ ) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1]; */

  /* NOTE(review): sbuf_aj is dereferenced below (and freed at the end)
     even though its allocation above is commented out, and `b` / the AIJ
     arrays a->i, a->j have no declaration here -- this whole packing
     section is still the AIJ code and must be rewritten for dense rows
     before __JUNK__ is ever enabled. */
  s_waits3 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits3);
  {
    int nzA, nzB, *a_i = a->i, *b_i = b->i, imark;
    int *cworkA, *cworkB, cstart = c->cstart, rstart = c->rstart, *bmap = c->garray;
    int *a_j = a->j, *b_j = b->j, ctmp, *t_cols;

    for ( i=0; i<nrqr; i++ ) {
      rbuf1_i   = rbuf1[i];
      sbuf_aj_i = sbuf_aj[i];
      ct1       = 2*rbuf1_i[0] + 1;
      ct2       = 0;
      for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) {
        kmax = rbuf1[i][2*j];
        for ( k=0; k<kmax; k++,ct1++ ) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];

          /* load the column indices for this row into cols*/
          cols = sbuf_aj_i + ct2;
          /* off-diagonal columns left of the diagonal block first... */
          for ( l=0; l<nzB; l++ ) {
            if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
            else break;
          }
          imark = l;
          /* ...then the diagonal block, then the remaining off-diagonal */
          for ( l=0; l<nzA; l++ )     cols[imark+l] = cstart + cworkA[l];
          for ( l=imark; l<nzB; l++ ) cols[nzA+l]   = bmap[cworkB[l]];

          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
    }
  }
  r_status3 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status3);
  s_status3 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status3);

  /* Allocate buffers for a->a, and send them off */
  /* NOTE(review): same AIJ leftover problem as above -- b_a / b_i / b_j
     come from the undeclared `b`. */
  sbuf_aa = (Scalar **)PetscMalloc((nrqr+1)*sizeof(Scalar *));CHKPTRQ(sbuf_aa);
  for ( i=0,j=0; i<nrqr; i++ ) j += req_size[i];
  sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar));CHKPTRQ(sbuf_aa[0]);
  for ( i=1; i<nrqr; i++ ) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];

  s_waits4 = (MPI_Request *) PetscMalloc((nrqr+1)*sizeof(MPI_Request));CHKPTRQ(s_waits4);
  {
    int    nzA, nzB, *a_i = a->i, *b_i = b->i, *cworkB, imark;
    int    cstart = c->cstart, rstart = c->rstart, *bmap = c->garray;
    int    *b_j = b->j;
    Scalar *vworkA, *vworkB, *a_a = a->a, *b_a = b->a,*t_vals;

    for ( i=0; i<nrqr; i++ ) {
      rbuf1_i   = rbuf1[i];
      sbuf_aa_i = sbuf_aa[i];
      ct1       = 2*rbuf1_i[0]+1;
      ct2       = 0;
      for ( j=1,max1=rbuf1_i[0]; j<=max1; j++ ) {
        kmax = rbuf1_i[2*j];
        for ( k=0; k<kmax; k++,ct1++ ) {
          row    = rbuf1_i[ct1] - rstart;
          nzA    = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
          ncols  = nzA + nzB;
          cworkB = b_j + b_i[row];
          vworkA = a_a + a_i[row];
          vworkB = b_a + b_i[row];

          /* load the column values for this row into vals -- same
             three-segment ordering as the column indices above */
          vals = sbuf_aa_i+ct2;
          for ( l=0; l<nzB; l++ ) {
            if ((bmap[cworkB[l]]) < cstart) vals[l] = vworkB[l];
            else break;
          }
          imark = l;
          for ( l=0; l<nzA; l++ )     vals[imark+l] = vworkA[l];
          for ( l=imark; l<nzB; l++ ) vals[nzA+l]   = vworkB[l];
          ct2 += ncols;
        }
      }
      ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
    }
  }
  r_status4 = (MPI_Status *) PetscMalloc((nrqs+1)*sizeof(MPI_Status));CHKPTRQ(r_status4);
  s_status4 = (MPI_Status *) PetscMalloc((nrqr+1)*sizeof(MPI_Status));CHKPTRQ(s_status4);
  PetscFree(rbuf1);

  /* Form the matrix */
  /* create col map: cmap[i][global col] = 1-based position in iscol[i],
     or 0 if the column is not selected. */
  {
    int *icol_i;

    len     = (1+ismax)*sizeof(int *) + ismax*c->N*sizeof(int);
    cmap    = (int **)PetscMalloc(len); CHKPTRQ(cmap);
    cmap[0] = (int *)(cmap + ismax);
    /* NOTE(review): this zeroes (1 + ismax*N) ints but only ismax*N were
       allocated for the data area -- off-by-one overwrite; verify the
       intended size before enabling this code. */
    PetscMemzero(cmap[0],(1+ismax*c->N)*sizeof(int));
    for ( i=1; i<ismax; i++ ) { cmap[i] = cmap[i-1] + c->N; }
    for ( i=0; i<ismax; i++ ) {
      jmax   = ncol[i];
      icol_i = icol[i];
      cmap_i = cmap[i];
      for ( j=0; j<jmax; j++ ) {
        cmap_i[icol_i[j]] = j+1;
      }
    }
  }

  /* Create lens which is required for MatCreate... */
  for ( i=0,j=0; i<ismax; i++ ) { j += nrow[i]; }
  len     = (1+ismax)*sizeof(int *) + j*sizeof(int);
  lens    = (int **)PetscMalloc(len); CHKPTRQ(lens);
  lens[0] = (int *)(lens + ismax);
  PetscMemzero(lens[0], j*sizeof(int));
  for ( i=1; i<ismax; i++ ) { lens[i] = lens[i-1] + nrow[i-1]; }

  /* Update lens from local data: count selected columns in locally
     owned requested rows. */
  for ( i=0; i<ismax; i++ ) {
    jmax   = nrow[i];
    cmap_i = cmap[i];
    irow_i = irow[i];
    lens_i = lens[i];
    for ( j=0; j<jmax; j++ ) {
      row  = irow_i[j];
      proc = rtable[row];
      if (proc == rank) {
        ierr = MatGetRow_MPIDense(C,row,&ncols,&cols,0); CHKERRQ(ierr);
        for ( k=0; k<ncols; k++ ) {
          if (cmap_i[cols[k]]) { lens_i[j]++;}
        }
        ierr = MatRestoreRow_MPIDense(C,row,&ncols,&cols,0); CHKERRQ(ierr);
      }
    }
  }

  /* Create row map: rmap[i][global row] = local row in submatrix i. */
  len     = (1+ismax)*sizeof(int *) + ismax*c->M*sizeof(int);
  rmap    = (int **)PetscMalloc(len); CHKPTRQ(rmap);
  rmap[0] = (int *)(rmap + ismax);
  PetscMemzero(rmap[0],ismax*c->M*sizeof(int));
  for ( i=1; i<ismax; i++ ) { rmap[i] = rmap[i-1] + c->M;}
  for ( i=0; i<ismax; i++ ) {
    rmap_i = rmap[i];
    irow_i = irow[i];
    jmax   = nrow[i];
    for ( j=0; j<jmax; j++ ) {
      rmap_i[irow_i[j]] = j;
    }
  }

  /* Update lens from offproc data: walk each received column-index
     buffer against the request we originally sent (sbuf1). */
  {
    int *rbuf2_i, *rbuf3_i, *sbuf1_i;

    for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
      ierr = MPI_Waitany(nrqs, r_waits3, &i, r_status3+tmp2);CHKERRQ(ierr);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax+1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      for ( j=1; j<=jmax; j++ ) {
        is_no  = sbuf1_i[2*j-1];
        max1   = sbuf1_i[2*j];
        lens_i = lens[is_no];
        cmap_i = cmap[is_no];
        rmap_i = rmap[is_no];
        for ( k=0; k<max1; k++,ct1++ ) {
          row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
          max2 = rbuf2_i[ct1];
          for ( l=0; l<max2; l++,ct2++ ) {
            if (cmap_i[rbuf3_i[ct2]]) {
              lens_i[row]++;
            }
          }
        }
      }
    }
  }
  PetscFree(r_status3); PetscFree(r_waits3);
  ierr = MPI_Waitall(nrqr,s_waits3,s_status3); CHKERRQ(ierr);
  PetscFree(s_status3); PetscFree(s_waits3);

  /* Create the submatrices */
  /* NOTE(review): mat->ilen / mat->i / mat->j below are AIJ-style fields;
     presumably Mat_SeqDense has no such members -- confirm against
     dense.h.  The MatCreateSeqDense call also passes AIJ-style
     (nz, nnz[]) arguments (0, lens[i]); check the actual signature. */
  if (scall == MAT_REUSE_MATRIX) {
    /*
        Assumes new rows are same length as the old rows, hence bug!
    */
    for ( i=0; i<ismax; i++ ) {
      mat = (Mat_SeqDense *)(submats[i]->data);
      if ((mat->m != nrow[i]) || (mat->n != ncol[i])) {
        SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong size");
      }
      if (PetscMemcmp(mat->ilen,lens[i], mat->m *sizeof(int))) {
        SETERRQ(PETSC_ERR_ARG_SIZ,0,"Cannot reuse matrix. wrong no of nonzeros");
      }
      /* Initial matrix as if empty */
      PetscMemzero(mat->ilen,mat->m*sizeof(int));
      submats[i]->factor = C->factor;
    }
  } else {
    for ( i=0; i<ismax; i++ ) {
      ierr = MatCreateSeqDense(PETSC_COMM_SELF,nrow[i],ncol[i],0,lens[i],submats+i);CHKERRQ(ierr);
    }
  }

  /* Assemble the matrices */
  /* First assemble the local rows */
  {
    int    ilen_row,*imat_ilen, *imat_j, *imat_i,old_row;
    Scalar *imat_a;

    for ( i=0; i<ismax; i++ ) {
      mat       = (Mat_SeqDense *) submats[i]->data;
      imat_ilen = mat->ilen;
      imat_j    = mat->j;
      imat_i    = mat->i;
      imat_a    = mat->a;
      cmap_i    = cmap[i];
      rmap_i    = rmap[i];
      irow_i    = irow[i];
      jmax      = nrow[i];
      for ( j=0; j<jmax; j++ ) {
        row  = irow_i[j];
        proc = rtable[row];
        if (proc == rank) {
          old_row  = row;
          row      = rmap_i[row];
          ilen_row = imat_ilen[row];
          ierr = MatGetRow_MPIDense(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          for ( k=0; k<ncols; k++ ) {
            if ((tcol = cmap_i[cols[k]])) {
              /* NOTE(review): `tcol - -1` ADDS one to the already 1-based
                 tcol; the off-proc assembly below uses `tcol - 1` to
                 convert back to 0-based -- almost certainly a typo for
                 `tcol - 1` here. */
              *mat_j++ = tcol - -1;
              *mat_a++ = vals[k];
              ilen_row++;
            }
          }
          ierr = MatRestoreRow_MPIDense(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
          imat_ilen[row] = ilen_row;
        }
      }
    }
  }

  /* Now assemble the off proc rows*/
  {
    int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
    int    *imat_j,*imat_i;
    Scalar *imat_a,*rbuf4_i;

    for ( tmp2=0; tmp2<nrqs; tmp2++ ) {
      ierr = MPI_Waitany(nrqs, r_waits4, &i, r_status4+tmp2);CHKERRQ(ierr);
      index   = pa[i];
      sbuf1_i = sbuf1[index];
      jmax    = sbuf1_i[0];
      ct1     = 2*jmax + 1;
      ct2     = 0;
      rbuf2_i = rbuf2[i];
      rbuf3_i = rbuf3[i];
      rbuf4_i = rbuf4[i];
      for ( j=1; j<=jmax; j++ ) {
        is_no     = sbuf1_i[2*j-1];
        rmap_i    = rmap[is_no];
        cmap_i    = cmap[is_no];
        mat       = (Mat_SeqDense *) submats[is_no]->data;
        imat_ilen = mat->ilen;
        imat_j    = mat->j;
        imat_i    = mat->i;
        imat_a    = mat->a;
        max1      = sbuf1_i[2*j];
        for ( k=0; k<max1; k++, ct1++ ) {
          row   = sbuf1_i[ct1];
          row   = rmap_i[row];
          ilen  = imat_ilen[row];
          mat_i = imat_i[row];
          mat_a = imat_a + mat_i;
          mat_j = imat_j + mat_i;
          max2  = rbuf2_i[ct1];
          for ( l=0; l<max2; l++,ct2++ ) {
            if ((tcol = cmap_i[rbuf3_i[ct2]])) {
              *mat_j++ = tcol - 1;   /* back to 0-based column index */
              *mat_a++ = rbuf4_i[ct2];
              ilen++;
            }
          }
          imat_ilen[row] = ilen;
        }
      }
    }
  }
  PetscFree(r_status4); PetscFree(r_waits4);
  ierr = MPI_Waitall(nrqr,s_waits4,s_status4); CHKERRQ(ierr);
  PetscFree(s_waits4); PetscFree(s_status4);

  /* Restore the indices */
  for ( i=0; i<ismax; i++ ) {
    ierr = ISRestoreIndices(isrow[i], irow+i); CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol[i], icol+i); CHKERRQ(ierr);
  }

  /* Destroy allocated memory */
  PetscFree(irow);
  PetscFree(w1);
  PetscFree(pa);

  PetscFree(sbuf1);
  PetscFree(rbuf2);
  for ( i=0; i<nrqr; ++i ) {
    PetscFree(sbuf2[i]);
  }
  for ( i=0; i<nrqs; ++i ) {
    PetscFree(rbuf3[i]);
    PetscFree(rbuf4[i]);
  }

  PetscFree(sbuf2);
  PetscFree(rbuf3);
  PetscFree(rbuf4 );
  PetscFree(sbuf_aj[0]);
  PetscFree(sbuf_aj);
  PetscFree(sbuf_aa[0]);
  PetscFree(sbuf_aa);

  PetscFree(cmap);
  PetscFree(rmap);
  PetscFree(lens);

  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag3); CHKERRQ(ierr);
  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag2); CHKERRQ(ierr);
  ierr = PetscObjectRestoreNewTag((PetscObject)C,&tag1); CHKERRQ(ierr);

  for ( i=0; i<ismax; i++ ) {
    ierr = MatAssemblyBegin(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
    ierr = MatAssemblyEnd(submats[i], MAT_FINAL_ASSEMBLY); CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}

#endif