1 2 /* 3 Defines matrix-matrix product routines for pairs of MPIAIJ matrices 4 C = A * B 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <../src/mat/impls/aij/mpi/mpiaij.h> 9 #include <petscbt.h> 10 #include <../src/mat/impls/dense/mpi/mpidense.h> 11 12 #undef __FUNCT__ 13 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ" 14 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C) 15 { 16 PetscErrorCode ierr; 17 18 PetscFunctionBegin; 19 if (scall == MAT_INITIAL_MATRIX){ 20 ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr); 21 } 22 23 ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr); 24 PetscFunctionReturn(0); 25 } 26 27 #undef __FUNCT__ 28 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult" 29 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A) 30 { 31 PetscErrorCode ierr; 32 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 33 Mat_PtAPMPI *ptap=a->ptap; 34 35 PetscFunctionBegin; 36 ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr); 37 ierr = PetscFree(ptap->bufa);CHKERRQ(ierr); 38 ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr); 39 ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr); 40 ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr); 41 ierr = PetscFree(ptap->api);CHKERRQ(ierr); 42 ierr = PetscFree(ptap->apj);CHKERRQ(ierr); 43 ierr = PetscFree(ptap->apa);CHKERRQ(ierr); 44 ierr = ptap->destroy(A);CHKERRQ(ierr); 45 ierr = PetscFree(ptap);CHKERRQ(ierr); 46 PetscFunctionReturn(0); 47 } 48 49 #undef __FUNCT__ 50 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult" 51 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M) 52 { 53 PetscErrorCode ierr; 54 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data; 55 Mat_PtAPMPI *ptap=a->ptap; 56 57 PetscFunctionBegin; 58 ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr); 59 (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */ 60 (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */ 61 PetscFunctionReturn(0); 62 } 63 64 #undef __FUNCT__ 65 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ" 66 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 67 { 68 PetscErrorCode ierr; 69 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data; 70 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 71 Mat_SeqAIJ *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data; 72 PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj; 73 PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a; 74 Mat_SeqAIJ *p_loc,*p_oth; 75 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj; 76 PetscScalar *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca; 77 PetscInt cm=C->rmap->n,anz,pnz; 78 Mat_PtAPMPI *ptap=c->ptap; 79 PetscInt *api,*apj,*apJ,i,j,k,row; 80 PetscInt cstart=C->cmap->rstart; 81 PetscInt cdnz,conz,k0,k1; 82 83 PetscFunctionBegin; 84 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 85 /*-----------------------------------------------------*/ 86 /* update numerical values of P_oth and P_loc */ 87 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 88 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 89 90 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 91 /*----------------------------------------------------------*/ 92 /* get data from symbolic products */ 93 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 94 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 95 pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a; 96 pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 97 98 /* get apa for storing dense row A[i,:]*P */ 99 apa = ptap->apa; 100 101 api = ptap->api; 102 apj = ptap->apj; 103 for (i=0; i<cm; i++) { 104 /* diagonal portion of A */ 105 anz = adi[i+1] - adi[i]; 106 adj = ad->j + adi[i]; 107 ada = ad->a + adi[i]; 108 for (j=0; j<anz; j++) { 109 row = adj[j]; 110 pnz = pi_loc[row+1] - pi_loc[row]; 111 pj = pj_loc + pi_loc[row]; 112 pa = pa_loc + pi_loc[row]; 113 114 /* perform dense axpy */ 115 valtmp = ada[j]; 116 for (k=0; k<pnz; k++){ 117 apa[pj[k]] += valtmp*pa[k]; 118 } 119 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 120 } 121 122 /* off-diagonal portion of A */ 123 anz = aoi[i+1] - aoi[i]; 124 aoj = ao->j + aoi[i]; 125 aoa = ao->a + aoi[i]; 126 for (j=0; j<anz; j++) { 127 row = aoj[j]; 128 pnz = pi_oth[row+1] - pi_oth[row]; 129 pj = pj_oth + pi_oth[row]; 130 pa = pa_oth + pi_oth[row]; 131 132 /* perform dense axpy */ 133 valtmp = aoa[j]; 134 for (k=0; k<pnz; k++){ 135 apa[pj[k]] += valtmp*pa[k]; 136 } 137 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 138 } 139 140 /* set values in C */ 141 apJ = apj + api[i]; 142 cdnz = cd->i[i+1] - cd->i[i]; 143 conz = co->i[i+1] - co->i[i]; 144 145 /* 1st off-diagoanl part of C */ 146 ca = coa + co->i[i]; 147 k = 0; 148 for (k0=0; k0<conz; k0++){ 149 if (apJ[k] >= cstart) break; 150 ca[k0] = apa[apJ[k]]; 151 apa[apJ[k]] = 0.0; 152 k++; 153 } 154 155 /* diagonal part of C */ 156 ca = cda + cd->i[i]; 157 for (k1=0; k1<cdnz; k1++){ 158 ca[k1] = apa[apJ[k]]; 159 apa[apJ[k]] = 0.0; 160 k++; 161 } 162 163 /* 2nd off-diagoanl part of C */ 164 ca = coa + co->i[i]; 165 for (; k0<conz; k0++){ 166 ca[k0] = apa[apJ[k]]; 167 apa[apJ[k]] = 0.0; 168 k++; 169 } 170 } 171 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 172 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ" 178 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C) 179 { 180 PetscErrorCode ierr; 181 MPI_Comm comm=((PetscObject)A)->comm; 182 Mat Cmpi; 183 Mat_PtAPMPI *ptap; 184 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 185 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c; 186 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth; 187 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz; 188 PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart; 189 PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi; 190 PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n; 191 PetscBT lnkbt; 192 PetscScalar *apa; 193 PetscReal afill; 194 PetscBool scalable=PETSC_TRUE; 195 PetscInt nlnk_max,armax,prmax; 196 197 PetscFunctionBegin; 198 if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend){ 199 SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend); 200 } 201 202 ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 203 ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr); 204 if (scalable){ 205 ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);CHKERRQ(ierr); 206 PetscFunctionReturn(0); 207 } 208 ierr = PetscOptionsEnd();CHKERRQ(ierr); 209 210 /* create struct Mat_PtAPMPI and attached it to C later */ 211 ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr); 212 213 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 214 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 215 216 /* get P_loc by taking all local rows of P */ 217 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 218 219 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 220 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 221 pi_loc = p_loc->i; pj_loc = p_loc->j; 222 pi_oth = p_oth->i; pj_oth = p_oth->j; 223 224 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 225 /*-------------------------------------------------------------------*/ 226 ierr = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr); 227 ptap->api = api; 228 api[0] = 0; 229 230 /* create and initialize a linked list */ 231 armax = ad->rmax+ao->rmax; 232 prmax = PetscMax(p_loc->rmax,p_oth->rmax); 233 nlnk_max = armax*prmax; 234 if (!nlnk_max || nlnk_max > pN) nlnk_max = pN; 235 ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr); 236 237 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 238 ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr); 239 current_space = free_space; 240 241 ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr); 242 for (i=0; i<am; i++) { 243 /* diagonal portion of A */ 244 nzi = adi[i+1] - adi[i]; 245 for (j=0; j<nzi; j++){ 246 row = *adj++; 247 pnz = pi_loc[row+1] - pi_loc[row]; 248 Jptr = pj_loc + pi_loc[row]; 249 /* add non-zero cols of P into the sorted linked list lnk */ 250 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 251 } 252 /* off-diagonal portion of A */ 253 nzi = aoi[i+1] - aoi[i]; 254 for (j=0; j<nzi; j++){ 255 row = *aoj++; 256 pnz = pi_oth[row+1] - pi_oth[row]; 257 Jptr = pj_oth + pi_oth[row]; 258 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 259 } 260 261 apnz = lnk[0]; 262 api[i+1] = api[i] + apnz; 263 264 /* if free space is not available, double the total space in the list */ 265 if (current_space->local_remaining<apnz) { 266 ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 267 nspacedouble++; 268 } 269 270 /* Copy data into free space, then initialize lnk */ 271 ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 272 ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr); 273 current_space->array += apnz; 274 current_space->local_used += apnz; 275 current_space->local_remaining -= apnz; 276 } 277 278 /* Allocate space for apj, initialize apj, and */ 279 /* destroy list of free space and other temporary array(s) */ 280 ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr); 281 apj = ptap->apj; 282 ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr); 283 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 284 285 /* malloc apa to store dense row A[i,:]*P */ 286 ierr = PetscMalloc(pN*sizeof(PetscScalar),&apa);CHKERRQ(ierr); 287 ierr = PetscMemzero(apa,pN*sizeof(PetscScalar));CHKERRQ(ierr); 288 ptap->apa = apa; 289 290 /* create and assemble symbolic parallel matrix Cmpi */ 291 /*----------------------------------------------------*/ 292 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 293 ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 294 ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr); 295 296 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 297 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 298 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 299 for (i=0; i<am; i++){ 300 row = i + rstart; 301 apnz = api[i+1] - api[i]; 302 ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr); 303 apj += apnz; 304 } 305 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 306 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 307 308 ptap->destroy = Cmpi->ops->destroy; 309 ptap->duplicate = Cmpi->ops->duplicate; 310 Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult; 311 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult; 312 313 /* attach the supporting struct to Cmpi for reuse */ 314 c = (Mat_MPIAIJ*)Cmpi->data; 315 c->ptap = ptap; 316 317 *C = Cmpi; 318 319 /* set MatInfo */ 320 afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5; 321 if (afill < 1.0) afill = 1.0; 322 Cmpi->info.mallocs = nspacedouble; 323 Cmpi->info.fill_ratio_given = fill; 324 Cmpi->info.fill_ratio_needed = afill; 325 326 #if defined(PETSC_USE_INFO) 327 if (api[am]) { 328 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr); 329 ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr); 330 } else { 331 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 332 } 333 #endif 334 PetscFunctionReturn(0); 335 } 336 337 #undef __FUNCT__ 338 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense" 339 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 340 { 341 PetscErrorCode ierr; 342 343 PetscFunctionBegin; 344 if (scall == MAT_INITIAL_MATRIX){ 345 ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr); 346 } 347 ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr); 348 PetscFunctionReturn(0); 349 } 350 351 typedef struct { 352 Mat workB; 353 PetscScalar *rvalues,*svalues; 354 MPI_Request *rwaits,*swaits; 355 } MPIAIJ_MPIDense; 356 357 #undef __FUNCT__ 358 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy" 359 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx) 360 { 361 MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx; 362 PetscErrorCode ierr; 363 364 PetscFunctionBegin; 365 ierr = MatDestroy(&contents->workB);CHKERRQ(ierr); 366 ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr); 367 ierr = PetscFree(contents);CHKERRQ(ierr); 368 PetscFunctionReturn(0); 369 } 370 371 #undef __FUNCT__ 372 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense" 373 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C) 374 { 375 PetscErrorCode ierr; 376 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data; 377 PetscInt nz = aij->B->cmap->n; 378 PetscContainer container; 379 MPIAIJ_MPIDense *contents; 380 VecScatter ctx = aij->Mvctx; 381 VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata; 382 VecScatter_MPI_General *to = ( VecScatter_MPI_General*) ctx->todata; 383 PetscInt m=A->rmap->n,n=B->cmap->n; 384 385 PetscFunctionBegin; 386 ierr = MatCreate(((PetscObject)B)->comm,C);CHKERRQ(ierr); 387 ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr); 388 ierr = MatSetBlockSizes(*C,A->rmap->bs,B->cmap->bs);CHKERRQ(ierr); 389 ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr); 390 ierr = MatMPIDenseSetPreallocation(*C,PETSC_NULL);CHKERRQ(ierr); 391 ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 392 ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 393 (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense; 394 395 ierr = PetscNew(MPIAIJ_MPIDense,&contents);CHKERRQ(ierr); 396 /* Create work matrix used to store off processor rows of B needed for local product */ 397 ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,PETSC_NULL,&contents->workB);CHKERRQ(ierr); 398 /* Create work arrays needed */ 399 ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues, 400 B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues, 401 from->n,MPI_Request,&contents->rwaits, 402 to->n,MPI_Request,&contents->swaits);CHKERRQ(ierr); 403 404 ierr = PetscContainerCreate(((PetscObject)A)->comm,&container);CHKERRQ(ierr); 405 ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr); 406 ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr); 407 ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr); 408 ierr = PetscContainerDestroy(&container);CHKERRQ(ierr); 409 PetscFunctionReturn(0); 410 } 411 412 #undef __FUNCT__ 413 #define __FUNCT__ "MatMPIDenseScatter" 414 /* 415 Performs an efficient scatter on the rows of B needed by this process; this is 416 a modification of the VecScatterBegin_() routines. 417 */ 418 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB) 419 { 420 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 421 PetscErrorCode ierr; 422 PetscScalar *b,*w,*svalues,*rvalues; 423 VecScatter ctx = aij->Mvctx; 424 VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata; 425 VecScatter_MPI_General *to = ( VecScatter_MPI_General*) ctx->todata; 426 PetscInt i,j,k; 427 PetscInt *sindices,*sstarts,*rindices,*rstarts; 428 PetscMPIInt *sprocs,*rprocs,nrecvs; 429 MPI_Request *swaits,*rwaits; 430 MPI_Comm comm = ((PetscObject)A)->comm; 431 PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n; 432 MPI_Status status; 433 MPIAIJ_MPIDense *contents; 434 PetscContainer container; 435 Mat workB; 436 437 PetscFunctionBegin; 438 ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr); 439 if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist"); 440 ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr); 441 442 workB = *outworkB = contents->workB; 443 if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",nrows,workB->cmap->n); 444 sindices = to->indices; 445 sstarts = to->starts; 446 sprocs = to->procs; 447 swaits = contents->swaits; 448 svalues = contents->svalues; 449 450 rindices = from->indices; 451 rstarts = from->starts; 452 rprocs = from->procs; 453 rwaits = contents->rwaits; 454 rvalues = contents->rvalues; 455 456 ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr); 457 ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr); 458 459 for (i=0; i<from->n; i++) { 460 ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr); 461 } 462 463 for (i=0; i<to->n; i++) { 464 /* pack a message at a time */ 465 CHKMEMQ; 466 for (j=0; j<sstarts[i+1]-sstarts[i]; j++){ 467 for (k=0; k<ncols; k++) { 468 svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k]; 469 } 470 } 471 CHKMEMQ; 472 ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr); 473 } 474 475 nrecvs = from->n; 476 while (nrecvs) { 477 ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr); 478 nrecvs--; 479 /* unpack a message at a time */ 480 CHKMEMQ; 481 for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++){ 482 for (k=0; k<ncols; k++) { 483 w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k]; 484 } 485 } 486 CHKMEMQ; 487 } 488 if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);} 489 490 ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr); 491 ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr); 492 ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 493 ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 494 PetscFunctionReturn(0); 495 } 496 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat); 497 498 #undef __FUNCT__ 499 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense" 500 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C) 501 { 502 PetscErrorCode ierr; 503 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 504 Mat_MPIDense *bdense = (Mat_MPIDense*)B->data; 505 Mat_MPIDense *cdense = (Mat_MPIDense*)C->data; 506 Mat workB; 507 508 PetscFunctionBegin; 509 510 /* diagonal block of A times all local rows of B*/ 511 ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr); 512 513 /* get off processor parts of B needed to complete the product */ 514 ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr); 515 516 /* off-diagonal block of A times nonlocal rows of B */ 517 ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr); 518 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 519 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 520 PetscFunctionReturn(0); 521 } 522 523 #undef __FUNCT__ 524 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable" 525 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C) 526 { 527 PetscErrorCode ierr; 528 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data; 529 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 530 Mat_SeqAIJ *cd=(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data; 531 PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj; 532 PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a; 533 Mat_SeqAIJ *p_loc,*p_oth; 534 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj; 535 PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca; 536 PetscInt cm=C->rmap->n,anz,pnz; 537 Mat_PtAPMPI *ptap=c->ptap; 538 PetscScalar *apa_sparse=ptap->apa; 539 PetscInt *api,*apj,*apJ,i,j,k,row; 540 PetscInt cstart=C->cmap->rstart; 541 PetscInt cdnz,conz,k0,k1,nextp; 542 543 PetscFunctionBegin; 544 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 545 /*-----------------------------------------------------*/ 546 /* update numerical values of P_oth and P_loc */ 547 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 548 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 549 550 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 551 /*----------------------------------------------------------*/ 552 /* get data from symbolic products */ 553 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 554 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 555 pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a; 556 pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 557 558 api = ptap->api; 559 apj = ptap->apj; 560 for (i=0; i<cm; i++) { 561 apJ = apj + api[i]; 562 563 /* diagonal portion of A */ 564 anz = adi[i+1] - adi[i]; 565 adj = ad->j + adi[i]; 566 ada = ad->a + adi[i]; 567 for (j=0; j<anz; j++) { 568 row = adj[j]; 569 pnz = pi_loc[row+1] - pi_loc[row]; 570 pj = pj_loc + pi_loc[row]; 571 pa = pa_loc + pi_loc[row]; 572 /* perform sparse axpy */ 573 valtmp = ada[j]; 574 nextp = 0; 575 for (k=0; nextp<pnz; k++) { 576 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 577 apa_sparse[k] += valtmp*pa[nextp++]; 578 } 579 } 580 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 581 } 582 583 /* off-diagonal portion of A */ 584 anz = aoi[i+1] - aoi[i]; 585 aoj = ao->j + aoi[i]; 586 aoa = ao->a + aoi[i]; 587 for (j=0; j<anz; j++) { 588 row = aoj[j]; 589 pnz = pi_oth[row+1] - pi_oth[row]; 590 pj = pj_oth + pi_oth[row]; 591 pa = pa_oth + pi_oth[row]; 592 /* perform sparse axpy */ 593 valtmp = aoa[j]; 594 nextp = 0; 595 for (k=0; nextp<pnz; k++) { 596 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 597 apa_sparse[k] += valtmp*pa[nextp++]; 598 } 599 } 600 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 601 } 602 603 /* set values in C */ 604 cdnz = cd->i[i+1] - cd->i[i]; 605 conz = co->i[i+1] - co->i[i]; 606 607 /* 1st off-diagoanl part of C */ 608 ca = coa + co->i[i]; 609 k = 0; 610 for (k0=0; k0<conz; k0++){ 611 if (apJ[k] >= cstart) break; 612 ca[k0] = apa_sparse[k]; 613 apa_sparse[k] = 0.0; 614 k++; 615 } 616 617 /* diagonal part of C */ 618 ca = cda + cd->i[i]; 619 for (k1=0; k1<cdnz; k1++){ 620 ca[k1] = apa_sparse[k]; 621 apa_sparse[k] = 0.0; 622 k++; 623 } 624 625 /* 2nd off-diagoanl part of C */ 626 ca = coa + co->i[i]; 627 for (; k0<conz; k0++){ 628 ca[k0] = apa_sparse[k]; 629 apa_sparse[k] = 0.0; 630 k++; 631 } 632 } 633 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 634 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 635 PetscFunctionReturn(0); 636 } 637 638 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid O(BN) memory requirement */ 639 #undef __FUNCT__ 640 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable" 641 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C) 642 { 643 PetscErrorCode ierr; 644 MPI_Comm comm=((PetscObject)A)->comm; 645 Mat Cmpi; 646 Mat_PtAPMPI *ptap; 647 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 648 Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*c; 649 Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth; 650 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz; 651 PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart; 652 PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0; 653 PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n; 654 PetscInt nlnk_max,armax,prmax; 655 PetscReal afill; 656 PetscScalar *apa; 657 658 PetscFunctionBegin; 659 /* create struct Mat_PtAPMPI and attached it to C later */ 660 ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr); 661 662 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 663 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 664 665 /* get P_loc by taking all local rows of P */ 666 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 667 668 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 669 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 670 pi_loc = p_loc->i; pj_loc = p_loc->j; 671 pi_oth = p_oth->i; pj_oth = p_oth->j; 672 673 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 674 /*-------------------------------------------------------------------*/ 675 ierr = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr); 676 ptap->api = api; 677 api[0] = 0; 678 679 /* create and initialize a linked list */ 680 armax = ad->rmax+ao->rmax; 681 prmax = PetscMax(p_loc->rmax,p_oth->rmax); 682 nlnk_max = armax*prmax; 683 if (!nlnk_max || nlnk_max > pN) nlnk_max = pN; 684 ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr); 685 686 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 687 ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr); 688 current_space = free_space; 689 690 ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr); 691 for (i=0; i<am; i++) { 692 /* diagonal portion of A */ 693 nzi = adi[i+1] - adi[i]; 694 for (j=0; j<nzi; j++){ 695 row = *adj++; 696 pnz = pi_loc[row+1] - pi_loc[row]; 697 Jptr = pj_loc + pi_loc[row]; 698 /* add non-zero cols of P into the sorted linked list lnk */ 699 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 700 } 701 /* off-diagonal portion of A */ 702 nzi = aoi[i+1] - aoi[i]; 703 for (j=0; j<nzi; j++){ 704 row = *aoj++; 705 pnz = pi_oth[row+1] - pi_oth[row]; 706 Jptr = pj_oth + pi_oth[row]; 707 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 708 } 709 710 apnz = *lnk; 711 api[i+1] = api[i] + apnz; 712 if (apnz > apnz_max) apnz_max = apnz; 713 714 /* if free space is not available, double the total space in the list */ 715 if (current_space->local_remaining<apnz) { 716 ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 717 nspacedouble++; 718 } 719 720 /* Copy data into free space, then initialize lnk */ 721 ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr); 722 ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr); 723 current_space->array += apnz; 724 current_space->local_used += apnz; 725 current_space->local_remaining -= apnz; 726 } 727 728 /* Allocate space for apj, initialize apj, and */ 729 /* destroy list of free space and other temporary array(s) */ 730 ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr); 731 apj = ptap->apj; 732 ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr); 733 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 734 735 /* create and assemble symbolic parallel matrix Cmpi */ 736 /*----------------------------------------------------*/ 737 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 738 ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 739 ierr = MatSetBlockSizes(Cmpi,A->rmap->bs,P->cmap->bs);CHKERRQ(ierr); 740 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 741 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 742 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 743 744 /* malloc apa for assembly Cmpi */ 745 ierr = PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);CHKERRQ(ierr); 746 ierr = PetscMemzero(apa,apnz_max*sizeof(PetscScalar));CHKERRQ(ierr); 747 ptap->apa = apa; 748 for (i=0; i<am; i++){ 749 row = i + rstart; 750 apnz = api[i+1] - api[i]; 751 ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr); 752 apj += apnz; 753 } 754 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 755 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 756 757 ptap->destroy = Cmpi->ops->destroy; 758 ptap->duplicate = Cmpi->ops->duplicate; 759 Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable; 760 Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult; 761 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult; 762 763 /* attach the supporting struct to Cmpi for reuse */ 764 c = (Mat_MPIAIJ*)Cmpi->data; 765 c->ptap = ptap; 766 767 *C = Cmpi; 768 769 /* set MatInfo */ 770 afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5; 771 if (afill < 1.0) afill = 1.0; 772 Cmpi->info.mallocs = nspacedouble; 773 Cmpi->info.fill_ratio_given = fill; 774 Cmpi->info.fill_ratio_needed = afill; 775 776 #if defined(PETSC_USE_INFO) 777 if (api[am]) { 778 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr); 779 ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.;\n",afill);CHKERRQ(ierr); 780 } else { 781 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 782 } 783 #endif 784 PetscFunctionReturn(0); 785 } 786 787 /*-------------------------------------------------------------------------*/ 788 #undef __FUNCT__ 789 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ" 790 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C) 791 { 792 PetscErrorCode ierr; 793 PetscBool scalable=PETSC_TRUE,viamatmatmult=PETSC_FALSE; 794 795 PetscFunctionBegin; 796 ierr = PetscOptionsBool("-mattransposematmult_viamatmatmult","Use R=Pt and C=R*A","",viamatmatmult,&viamatmatmult,PETSC_NULL);CHKERRQ(ierr); 797 if (viamatmatmult){ 798 Mat Pt; 799 Mat_PtAPMPI *ptap; 800 Mat_MPIAIJ *c; 801 if (scall == MAT_INITIAL_MATRIX){ 802 ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr); 803 ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr); 804 805 c = (Mat_MPIAIJ*)(*C)->data; 806 ptap = c->ptap; 807 ptap->Pt = Pt; 808 } else if (scall == MAT_REUSE_MATRIX){ 809 c = (Mat_MPIAIJ*)(*C)->data; 810 ptap = c->ptap; 811 Pt = ptap->Pt; 812 ierr = MatTranspose(P,scall,&Pt);CHKERRQ(ierr); 813 ierr = MatMatMult(Pt,A,scall,fill,C);CHKERRQ(ierr); 814 } else { 815 SETERRQ(((PetscObject)A)->comm,PETSC_ERR_ARG_WRONGSTATE,"Not supported"); 816 } 817 PetscFunctionReturn(0); 818 } 819 820 if (scall == MAT_INITIAL_MATRIX){ 821 ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 822 ierr = PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr); 823 if (scalable){ 824 ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);CHKERRQ(ierr); 825 } else { 826 ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr); 827 } 828 ierr = PetscOptionsEnd();CHKERRQ(ierr); 829 } 830 ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr); 831 PetscFunctionReturn(0); 832 } 833 834 #undef __FUNCT__ 835 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ" 836 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C) 837 { 838 PetscErrorCode ierr; 839 Mat_Merge_SeqsToMPI *merge; 840 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 841 Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data; 842 Mat_PtAPMPI *ptap; 843 PetscInt *adj,*aJ; 844 PetscInt i,j,k,anz,pnz,row,*cj; 845 MatScalar *ada,*aval,*ca,valtmp; 846 PetscInt am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n; 847 MPI_Comm comm=((PetscObject)C)->comm; 848 PetscMPIInt size,rank,taga,*len_s; 849 PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci; 850 PetscInt **buf_ri,**buf_rj; 851 PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */ 852 MPI_Request *s_waits,*r_waits; 853 MPI_Status *status; 854 MatScalar **abuf_r,*ba_i,*pA,*coa,*ba; 855 PetscInt *ai,*aj,*coi,*coj; 856 PetscInt *poJ,*pdJ; 857 Mat A_loc; 858 Mat_SeqAIJ *a_loc; 859 860 PetscFunctionBegin; 861 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 862 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 863 864 ptap = c->ptap; 865 merge = ptap->merge; 866 867 /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */ 868 /*--------------------------------------------------------------*/ 869 /* get data from symbolic products */ 870 coi = merge->coi; coj = merge->coj; 871 ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr); 872 ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr); 873 874 bi = merge->bi; bj = merge->bj; 875 owners = merge->rowmap->range; 876 ierr = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr); 877 ierr = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr); 878 879 /* get A_loc by taking all local rows of A */ 880 A_loc = ptap->A_loc; 881 ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr); 882 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 883 ai = a_loc->i; 884 aj = a_loc->j; 885 886 ierr = PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval);CHKERRQ(ierr); /* non-scalable!!! */ 887 ierr = PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr); 888 889 for (i=0; i<am; i++) { 890 /* 2-a) put A[i,:] to dense array aval */ 891 anz = ai[i+1] - ai[i]; 892 adj = aj + ai[i]; 893 ada = a_loc->a + ai[i]; 894 for (j=0; j<anz; j++){ 895 aval[adj[j]] = ada[j]; 896 } 897 898 /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */ 899 /*--------------------------------------------------------------*/ 900 /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */ 901 pnz = po->i[i+1] - po->i[i]; 902 poJ = po->j + po->i[i]; 903 pA = po->a + po->i[i]; 904 for (j=0; j<pnz; j++){ 905 row = poJ[j]; 906 cnz = coi[row+1] - coi[row]; 907 cj = coj + coi[row]; 908 ca = coa + coi[row]; 909 /* perform dense axpy */ 910 valtmp = pA[j]; 911 for (k=0; k<cnz; k++) { 912 ca[k] += valtmp*aval[cj[k]]; 913 } 914 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 915 } 916 917 /* put the value into Cd (diagonal part) */ 918 pnz = pd->i[i+1] - pd->i[i]; 919 pdJ = pd->j + pd->i[i]; 920 pA = pd->a + pd->i[i]; 921 for (j=0; j<pnz; j++){ 922 row = pdJ[j]; 923 cnz = bi[row+1] - bi[row]; 924 cj = bj + bi[row]; 925 ca = ba + bi[row]; 926 /* perform dense axpy */ 927 valtmp = pA[j]; 928 for (k=0; k<cnz; k++) { 929 ca[k] += valtmp*aval[cj[k]]; 930 } 931 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 932 } 933 934 /* zero the current row of Pt*A */ 935 aJ = aj + ai[i]; 936 for (k=0; k<anz; k++) aval[aJ[k]] = 0.0; 937 } 938 939 /* 3) send and recv matrix values coa */ 940 /*------------------------------------*/ 941 buf_ri = merge->buf_ri; 942 buf_rj = merge->buf_rj; 943 len_s = merge->len_s; 944 ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr); 945 ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 946 947 ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr); 948 for (proc=0,k=0; proc<size; proc++){ 949 if (!len_s[proc]) continue; 950 i = merge->owners_co[proc]; 951 ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 952 k++; 953 } 954 if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);} 955 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);} 956 957 ierr = PetscFree2(s_waits,status);CHKERRQ(ierr); 958 ierr = PetscFree(r_waits);CHKERRQ(ierr); 959 ierr = PetscFree(coa);CHKERRQ(ierr); 960 961 /* 4) insert local Cseq and received values into Cmpi */ 962 /*----------------------------------------------------*/ 963 ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr); 964 for (k=0; k<merge->nrecv; k++){ 965 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 966 nrows = *(buf_ri_k[k]); 967 nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 968 nextci[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 969 } 970 971 for (i=0; i<cm; i++) { 972 row = owners[rank] + i; /* global row index of C_seq */ 973 bj_i = bj + bi[i]; /* col indices of the i-th row of C */ 974 ba_i = ba + bi[i]; 975 bnz = bi[i+1] - bi[i]; 976 /* add received vals into ba */ 977 for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 978 /* i-th row */ 979 if (i == *nextrow[k]) { 980 cnz = *(nextci[k]+1) - *nextci[k]; 981 cj = buf_rj[k] + *(nextci[k]); 982 ca = abuf_r[k] + *(nextci[k]); 983 nextcj = 0; 984 for (j=0; nextcj<cnz; j++){ 985 if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */ 986 ba_i[j] += ca[nextcj++]; 987 } 988 } 989 nextrow[k]++; nextci[k]++; 990 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 991 } 992 } 993 ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 994 } 995 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 996 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 997 998 ierr = PetscFree(ba);CHKERRQ(ierr); 999 ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr); 1000 ierr = PetscFree(abuf_r);CHKERRQ(ierr); 1001 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1002 ierr = PetscFree(aval);CHKERRQ(ierr); 1003 PetscFunctionReturn(0); 1004 } 1005 1006 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */ 1007 #undef __FUNCT__ 1008 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ" 1009 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C) 1010 { 1011 PetscErrorCode ierr; 1012 Mat Cmpi,A_loc,POt,PDt; 1013 Mat_PtAPMPI *ptap; 1014 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 1015 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c; 1016 PetscInt *pdti,*pdtj,*poti,*potj,*ptJ; 1017 PetscInt nnz; 1018 PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row; 1019 PetscInt am=A->rmap->n,pn=P->cmap->n; 1020 PetscBT lnkbt; 1021 MPI_Comm comm=((PetscObject)A)->comm; 1022 PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri; 1023 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 1024 PetscInt len,proc,*dnz,*onz,*owners; 1025 PetscInt nzi,*bi,*bj; 1026 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1027 MPI_Request *swaits,*rwaits; 1028 MPI_Status *sstatus,rstatus; 1029 Mat_Merge_SeqsToMPI *merge; 1030 PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j; 1031 PetscReal afill=1.0,afill_tmp; 1032 PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax; 1033 PetscScalar *vals; 1034 Mat_SeqAIJ *a_loc, *pdt,*pot; 1035 1036 PetscFunctionBegin; 1037 /* check if matrix local sizes are compatible */ 1038 if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){ 1039 SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend); 1040 } 1041 1042 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1043 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1044 1045 /* create struct Mat_PtAPMPI and attached it to C later */ 1046 ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr); 1047 1048 /* get A_loc by taking all local rows of A */ 1049 ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr); 1050 ptap->A_loc = A_loc; 1051 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 1052 ai = a_loc->i; 1053 aj = a_loc->j; 1054 1055 /* determine symbolic Co=(p->B)^T*A - send to others */ 1056 /*----------------------------------------------------*/ 1057 ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr); 1058 pdt = (Mat_SeqAIJ*)PDt->data; 1059 pdti = pdt->i; pdtj = pdt->j; 1060 1061 ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr); 1062 pot = (Mat_SeqAIJ*)POt->data; 1063 poti = pot->i; potj = pot->j; 1064 1065 /* then, compute symbolic Co = (p->B)^T*A */ 1066 pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors 1067 >= (num of nonzero rows of C_seq) - pn */ 1068 ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr); 1069 coi[0] = 0; 1070 1071 /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */ 1072 nnz = fill*(poti[pon] + ai[am]); 1073 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1074 current_space = free_space; 1075 1076 /* create and initialize a linked list */ 1077 i = PetscMax(pdt->rmax,pot->rmax); 1078 Crmax = i*a_loc->rmax*size; 1079 if (!Crmax || Crmax > aN) Crmax = aN; 1080 ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr); 1081 1082 for (i=0; i<pon; i++) { 1083 pnz = poti[i+1] - poti[i]; 1084 ptJ = potj + poti[i]; 1085 for (j=0; j<pnz; j++){ 1086 row = ptJ[j]; /* row of A_loc == col of Pot */ 1087 anz = ai[row+1] - ai[row]; 1088 Jptr = aj + ai[row]; 1089 /* add non-zero cols of AP into the sorted linked list lnk */ 1090 ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1091 } 1092 nnz = lnk[0]; 1093 1094 /* If free space is not available, double the total space in the list */ 1095 if (current_space->local_remaining<nnz) { 1096 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1097 nspacedouble++; 1098 } 1099 1100 /* Copy data into free space, and zero out denserows */ 1101 ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1102 current_space->array += nnz; 1103 current_space->local_used += nnz; 1104 current_space->local_remaining -= nnz; 1105 coi[i+1] = coi[i] + nnz; 1106 } 1107 1108 ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr); 1109 ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr); 1110 afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1); 1111 if (afill_tmp > afill) afill = afill_tmp; 1112 1113 /* send j-array (coj) of Co to other processors */ 1114 /*----------------------------------------------*/ 1115 /* determine row ownership */ 1116 ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr); 1117 ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr); 1118 merge->rowmap->n = pn; 1119 merge->rowmap->bs = 1; 1120 ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr); 1121 owners = merge->rowmap->range; 1122 1123 /* determine the number of messages to send, their lengths */ 1124 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr); 1125 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 1126 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr); 1127 len_s = merge->len_s; 1128 merge->nsend = 0; 1129 1130 ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr); 1131 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 1132 1133 proc = 0; 1134 for (i=0; i<pon; i++){ 1135 while (prmap[i] >= owners[proc+1]) proc++; 1136 len_si[proc]++; /* num of rows in Co to be sent to [proc] */ 1137 len_s[proc] += coi[i+1] - coi[i]; 1138 } 1139 1140 len = 0; /* max length of buf_si[] */ 1141 owners_co[0] = 0; 1142 for (proc=0; proc<size; proc++){ 1143 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1144 if (len_si[proc]){ 1145 merge->nsend++; 1146 len_si[proc] = 2*(len_si[proc] + 1); 1147 len += len_si[proc]; 1148 } 1149 } 1150 1151 /* determine the number and length of messages to receive for coi and coj */ 1152 ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 1153 ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 1154 1155 /* post the Irecv and Isend of coj */ 1156 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 1157 ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 1158 ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr); 1159 for (proc=0, k=0; proc<size; proc++){ 1160 if (!len_s[proc]) continue; 1161 i = owners_co[proc]; 1162 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 1163 k++; 1164 } 1165 1166 /* receives and sends of coj are complete */ 1167 ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr); 1168 for (i=0; i<merge->nrecv; i++){ 1169 PetscMPIInt icompleted; 1170 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1171 } 1172 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1173 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1174 1175 /* send and recv coi */ 1176 /*-------------------*/ 1177 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 1178 ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 1179 ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr); 1180 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1181 for (proc=0,k=0; proc<size; proc++){ 1182 if (!len_s[proc]) continue; 1183 /* form outgoing message for i-structure: 1184 buf_si[0]: nrows to be sent 1185 [1:nrows]: row index (global) 1186 [nrows+1:2*nrows+1]: i-structure index 1187 */ 1188 /*-------------------------------------------*/ 1189 nrows = len_si[proc]/2 - 1; 1190 buf_si_i = buf_si + nrows+1; 1191 buf_si[0] = nrows; 1192 buf_si_i[0] = 0; 1193 nrows = 0; 1194 for (i=owners_co[proc]; i<owners_co[proc+1]; i++){ 1195 nzi = coi[i+1] - coi[i]; 1196 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 1197 buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */ 1198 nrows++; 1199 } 1200 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 1201 k++; 1202 buf_si += len_si[proc]; 1203 } 1204 i = merge->nrecv; 1205 while (i--) { 1206 PetscMPIInt icompleted; 1207 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1208 } 1209 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1210 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1211 ierr = PetscFree(len_si);CHKERRQ(ierr); 1212 ierr = PetscFree(len_ri);CHKERRQ(ierr); 1213 ierr = PetscFree(swaits);CHKERRQ(ierr); 1214 ierr = PetscFree(sstatus);CHKERRQ(ierr); 1215 ierr = PetscFree(buf_s);CHKERRQ(ierr); 1216 1217 /* compute the local portion of C (mpi mat) */ 1218 /*------------------------------------------*/ 1219 /* allocate bi array and free space for accumulating nonzero column info */ 1220 ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 1221 bi[0] = 0; 1222 1223 /* set initial free space to be fill*(nnz(P) + nnz(A)) */ 1224 nnz = fill*(pdti[pn] + poti[pon] + ai[am]); 1225 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1226 current_space = free_space; 1227 1228 ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr); 1229 for (k=0; k<merge->nrecv; k++){ 1230 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1231 nrows = *buf_ri_k[k]; 1232 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1233 nextci[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 1234 } 1235 1236 ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr); 1237 rmax = 0; 1238 for (i=0; i<pn; i++) { 1239 /* add pdt[i,:]*AP into lnk */ 1240 pnz = pdti[i+1] - pdti[i]; 1241 ptJ = pdtj + pdti[i]; 1242 for (j=0; j<pnz; j++){ 1243 row = ptJ[j]; /* row of AP == col of Pt */ 1244 anz = ai[row+1] - ai[row]; 1245 Jptr = aj + ai[row]; 1246 /* add non-zero cols of AP into the sorted linked list lnk */ 1247 ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1248 } 1249 1250 /* add received col data into lnk */ 1251 for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 1252 if (i == *nextrow[k]) { /* i-th row */ 1253 nzi = *(nextci[k]+1) - *nextci[k]; 1254 Jptr = buf_rj[k] + *nextci[k]; 1255 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1256 nextrow[k]++; nextci[k]++; 1257 } 1258 } 1259 nnz = lnk[0]; 1260 1261 /* if free space is not available, make more free space */ 1262 if (current_space->local_remaining<nnz) { 1263 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1264 nspacedouble++; 1265 } 1266 /* copy data into free space, then initialize lnk */ 1267 ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1268 ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr); 1269 current_space->array += nnz; 1270 current_space->local_used += nnz; 1271 current_space->local_remaining -= nnz; 1272 bi[i+1] = bi[i] + nnz; 1273 if (nnz > rmax) rmax = nnz; 1274 } 1275 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1276 1277 ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 1278 ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr); 1279 afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1); 1280 if (afill_tmp > afill) afill = afill_tmp; 1281 ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 1282 ierr = MatDestroy(&POt);CHKERRQ(ierr); 1283 ierr = MatDestroy(&PDt);CHKERRQ(ierr); 1284 1285 /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */ 1286 /*----------------------------------------------------------------------------------*/ 1287 ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr); 1288 ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr); 1289 1290 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 1291 ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1292 ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr); 1293 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 1294 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1295 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 1296 ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr); 1297 for (i=0; i<pn; i++){ 1298 row = i + rstart; 1299 nnz = bi[i+1] - bi[i]; 1300 Jptr = bj + bi[i]; 1301 ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr); 1302 } 1303 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1304 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1305 ierr = PetscFree(vals);CHKERRQ(ierr); 1306 1307 merge->bi = bi; 1308 merge->bj = bj; 1309 merge->coi = coi; 1310 merge->coj = coj; 1311 merge->buf_ri = buf_ri; 1312 merge->buf_rj = buf_rj; 1313 merge->owners_co = owners_co; 1314 merge->destroy = Cmpi->ops->destroy; 1315 merge->duplicate = Cmpi->ops->duplicate; 1316 1317 Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ; 1318 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1319 1320 /* attach the supporting struct to Cmpi for reuse */ 1321 c = (Mat_MPIAIJ*)Cmpi->data; 1322 c->ptap = ptap; 1323 ptap->api = PETSC_NULL; 1324 ptap->apj = PETSC_NULL; 1325 ptap->merge = merge; 1326 ptap->rmax = rmax; 1327 1328 *C = Cmpi; 1329 #if defined(PETSC_USE_INFO) 1330 if (bi[pn] != 0) { 1331 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr); 1332 ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr); 1333 } else { 1334 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 1335 } 1336 #endif 1337 PetscFunctionReturn(0); 1338 } 1339 1340 #undef __FUNCT__ 1341 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable" 1342 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C) 1343 { 1344 PetscErrorCode ierr; 1345 Mat_Merge_SeqsToMPI *merge; 1346 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 1347 Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data; 1348 Mat_PtAPMPI *ptap; 1349 PetscInt *adj; 1350 PetscInt i,j,k,anz,pnz,row,*cj,nexta; 1351 MatScalar *ada,*ca,valtmp; 1352 PetscInt am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n; 1353 MPI_Comm comm=((PetscObject)C)->comm; 1354 PetscMPIInt size,rank,taga,*len_s; 1355 PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci; 1356 PetscInt **buf_ri,**buf_rj; 1357 PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */ 1358 MPI_Request *s_waits,*r_waits; 1359 MPI_Status *status; 1360 MatScalar **abuf_r,*ba_i,*pA,*coa,*ba; 1361 PetscInt *ai,*aj,*coi,*coj; 1362 PetscInt *poJ,*pdJ; 1363 Mat A_loc; 1364 Mat_SeqAIJ *a_loc; 1365 1366 PetscFunctionBegin; 1367 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1368 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1369 1370 ptap = c->ptap; 1371 merge = ptap->merge; 1372 1373 /* 2) compute numeric C_seq = P_loc^T*A_loc */ 1374 /*------------------------------------------*/ 1375 /* get data from symbolic products */ 1376 coi = merge->coi; coj = merge->coj; 1377 ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr); 1378 ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr); 1379 bi = merge->bi; bj = merge->bj; 1380 owners = merge->rowmap->range; 1381 ierr = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr); 1382 ierr = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr); 1383 1384 /* get A_loc by taking all local rows of A */ 1385 A_loc = ptap->A_loc; 1386 ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr); 1387 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 1388 ai = a_loc->i; 1389 aj = a_loc->j; 1390 1391 for (i=0; i<am; i++) { 1392 anz = ai[i+1] - ai[i]; 1393 adj = aj + ai[i]; 1394 ada = a_loc->a + ai[i]; 1395 1396 /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */ 1397 /*-------------------------------------------------------------*/ 1398 /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */ 1399 pnz = po->i[i+1] - po->i[i]; 1400 poJ = po->j + po->i[i]; 1401 pA = po->a + po->i[i]; 1402 for (j=0; j<pnz; j++){ 1403 row = poJ[j]; 1404 cj = coj + coi[row]; 1405 ca = coa + coi[row]; 1406 /* perform sparse axpy */ 1407 nexta = 0; 1408 valtmp = pA[j]; 1409 for (k=0; nexta<anz; k++) { 1410 if (cj[k] == adj[nexta]){ 1411 ca[k] += valtmp*ada[nexta]; 1412 nexta++; 1413 } 1414 } 1415 ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr); 1416 } 1417 1418 /* put the value into Cd (diagonal part) */ 1419 pnz = pd->i[i+1] - pd->i[i]; 1420 pdJ = pd->j + pd->i[i]; 1421 pA = pd->a + pd->i[i]; 1422 for (j=0; j<pnz; j++){ 1423 row = pdJ[j]; 1424 cj = bj + bi[row]; 1425 ca = ba + bi[row]; 1426 /* perform sparse axpy */ 1427 nexta = 0; 1428 valtmp = pA[j]; 1429 for (k=0; nexta<anz; k++) { 1430 if (cj[k] == adj[nexta]){ 1431 ca[k] += valtmp*ada[nexta]; 1432 nexta++; 1433 } 1434 } 1435 ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr); 1436 } 1437 } 1438 1439 /* 3) send and recv matrix values coa */ 1440 /*------------------------------------*/ 1441 buf_ri = merge->buf_ri; 1442 buf_rj = merge->buf_rj; 1443 len_s = merge->len_s; 1444 ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr); 1445 ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 1446 1447 ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr); 1448 for (proc=0,k=0; proc<size; proc++){ 1449 if (!len_s[proc]) continue; 1450 i = merge->owners_co[proc]; 1451 ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 1452 k++; 1453 } 1454 if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);} 1455 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);} 1456 1457 ierr = PetscFree2(s_waits,status);CHKERRQ(ierr); 1458 ierr = PetscFree(r_waits);CHKERRQ(ierr); 1459 ierr = PetscFree(coa);CHKERRQ(ierr); 1460 1461 /* 4) insert local Cseq and received values into Cmpi */ 1462 /*----------------------------------------------------*/ 1463 ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr); 1464 for (k=0; k<merge->nrecv; k++){ 1465 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1466 nrows = *(buf_ri_k[k]); 1467 nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 1468 nextci[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 1469 } 1470 1471 for (i=0; i<cm; i++) { 1472 row = owners[rank] + i; /* global row index of C_seq */ 1473 bj_i = bj + bi[i]; /* col indices of the i-th row of C */ 1474 ba_i = ba + bi[i]; 1475 bnz = bi[i+1] - bi[i]; 1476 /* add received vals into ba */ 1477 for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 1478 /* i-th row */ 1479 if (i == *nextrow[k]) { 1480 cnz = *(nextci[k]+1) - *nextci[k]; 1481 cj = buf_rj[k] + *(nextci[k]); 1482 ca = abuf_r[k] + *(nextci[k]); 1483 nextcj = 0; 1484 for (j=0; nextcj<cnz; j++){ 1485 if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */ 1486 ba_i[j] += ca[nextcj++]; 1487 } 1488 } 1489 nextrow[k]++; nextci[k]++; 1490 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 1491 } 1492 } 1493 ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 1494 } 1495 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1496 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1497 1498 ierr = PetscFree(ba);CHKERRQ(ierr); 1499 ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr); 1500 ierr = PetscFree(abuf_r);CHKERRQ(ierr); 1501 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1502 PetscFunctionReturn(0); 1503 } 1504 1505 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */ 1506 #undef __FUNCT__ 1507 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable" 1508 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C) 1509 { 1510 PetscErrorCode ierr; 1511 Mat Cmpi,A_loc,POt,PDt; 1512 Mat_PtAPMPI *ptap; 1513 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 1514 Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c; 1515 PetscInt *pdti,*pdtj,*poti,*potj,*ptJ; 1516 PetscInt nnz; 1517 PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row; 1518 PetscInt am=A->rmap->n,pn=P->cmap->n; 1519 MPI_Comm comm=((PetscObject)A)->comm; 1520 PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri; 1521 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 1522 PetscInt len,proc,*dnz,*onz,*owners; 1523 PetscInt nzi,*bi,*bj; 1524 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1525 MPI_Request *swaits,*rwaits; 1526 MPI_Status *sstatus,rstatus; 1527 Mat_Merge_SeqsToMPI *merge; 1528 PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j; 1529 PetscReal afill=1.0,afill_tmp; 1530 PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax; 1531 PetscScalar *vals; 1532 Mat_SeqAIJ *a_loc, *pdt,*pot; 1533 1534 PetscFunctionBegin; 1535 /* check if matrix local sizes are compatible */ 1536 if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){ 1537 SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend); 1538 } 1539 1540 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1541 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1542 1543 /* create struct Mat_PtAPMPI and attached it to C later */ 1544 ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr); 1545 1546 /* get A_loc by taking all local rows of A */ 1547 ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr); 1548 ptap->A_loc = A_loc; 1549 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 1550 ai = a_loc->i; 1551 aj = a_loc->j; 1552 1553 /* determine symbolic Co=(p->B)^T*A - send to others */ 1554 /*----------------------------------------------------*/ 1555 ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr); 1556 pdt = (Mat_SeqAIJ*)PDt->data; 1557 pdti = pdt->i; pdtj = pdt->j; 1558 1559 ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr); 1560 pot = (Mat_SeqAIJ*)POt->data; 1561 poti = pot->i; potj = pot->j; 1562 1563 /* then, compute symbolic Co = (p->B)^T*A */ 1564 pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors 1565 >= (num of nonzero rows of C_seq) - pn */ 1566 ierr = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr); 1567 coi[0] = 0; 1568 1569 /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */ 1570 nnz = fill*(poti[pon] + ai[am]); 1571 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1572 current_space = free_space; 1573 1574 /* create and initialize a linked list */ 1575 i = PetscMax(pdt->rmax,pot->rmax); 1576 Crmax = i*a_loc->rmax*size; /* non-scalable! */ 1577 if (!Crmax || Crmax > aN) Crmax = aN; 1578 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 1579 1580 for (i=0; i<pon; i++) { 1581 pnz = poti[i+1] - poti[i]; 1582 ptJ = potj + poti[i]; 1583 for (j=0; j<pnz; j++){ 1584 row = ptJ[j]; /* row of A_loc == col of Pot */ 1585 anz = ai[row+1] - ai[row]; 1586 Jptr = aj + ai[row]; 1587 /* add non-zero cols of AP into the sorted linked list lnk */ 1588 ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr); 1589 } 1590 nnz = lnk[0]; 1591 1592 /* If free space is not available, double the total space in the list */ 1593 if (current_space->local_remaining<nnz) { 1594 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1595 nspacedouble++; 1596 } 1597 1598 /* Copy data into free space, and zero out denserows */ 1599 ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr); 1600 current_space->array += nnz; 1601 current_space->local_used += nnz; 1602 current_space->local_remaining -= nnz; 1603 coi[i+1] = coi[i] + nnz; 1604 } 1605 1606 ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr); 1607 ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr); 1608 afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1); 1609 if (afill_tmp > afill) afill = afill_tmp; 1610 1611 /* send j-array (coj) of Co to other processors */ 1612 /*----------------------------------------------*/ 1613 /* determine row ownership */ 1614 ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr); 1615 ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr); 1616 merge->rowmap->n = pn; 1617 merge->rowmap->bs = 1; 1618 ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr); 1619 owners = merge->rowmap->range; 1620 1621 /* determine the number of messages to send, their lengths */ 1622 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr); 1623 ierr = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 1624 ierr = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr); 1625 len_s = merge->len_s; 1626 merge->nsend = 0; 1627 1628 ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr); 1629 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 1630 1631 proc = 0; 1632 for (i=0; i<pon; i++){ 1633 while (prmap[i] >= owners[proc+1]) proc++; 1634 len_si[proc]++; /* num of rows in Co to be sent to [proc] */ 1635 len_s[proc] += coi[i+1] - coi[i]; 1636 } 1637 1638 len = 0; /* max length of buf_si[] */ 1639 owners_co[0] = 0; 1640 for (proc=0; proc<size; proc++){ 1641 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1642 if (len_si[proc]){ 1643 merge->nsend++; 1644 len_si[proc] = 2*(len_si[proc] + 1); 1645 len += len_si[proc]; 1646 } 1647 } 1648 1649 /* determine the number and length of messages to receive for coi and coj */ 1650 ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 1651 ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 1652 1653 /* post the Irecv and Isend of coj */ 1654 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 1655 ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 1656 ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr); 1657 for (proc=0, k=0; proc<size; proc++){ 1658 if (!len_s[proc]) continue; 1659 i = owners_co[proc]; 1660 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 1661 k++; 1662 } 1663 1664 /* receives and sends of coj are complete */ 1665 ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr); 1666 for (i=0; i<merge->nrecv; i++){ 1667 PetscMPIInt icompleted; 1668 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1669 } 1670 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1671 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1672 1673 /* send and recv coi */ 1674 /*-------------------*/ 1675 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 1676 ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 1677 ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr); 1678 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1679 for (proc=0,k=0; proc<size; proc++){ 1680 if (!len_s[proc]) continue; 1681 /* form outgoing message for i-structure: 1682 buf_si[0]: nrows to be sent 1683 [1:nrows]: row index (global) 1684 [nrows+1:2*nrows+1]: i-structure index 1685 */ 1686 /*-------------------------------------------*/ 1687 nrows = len_si[proc]/2 - 1; 1688 buf_si_i = buf_si + nrows+1; 1689 buf_si[0] = nrows; 1690 buf_si_i[0] = 0; 1691 nrows = 0; 1692 for (i=owners_co[proc]; i<owners_co[proc+1]; i++){ 1693 nzi = coi[i+1] - coi[i]; 1694 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 1695 buf_si[nrows+1] =prmap[i] -owners[proc]; /* local row index */ 1696 nrows++; 1697 } 1698 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 1699 k++; 1700 buf_si += len_si[proc]; 1701 } 1702 i = merge->nrecv; 1703 while (i--) { 1704 PetscMPIInt icompleted; 1705 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1706 } 1707 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1708 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1709 ierr = PetscFree(len_si);CHKERRQ(ierr); 1710 ierr = PetscFree(len_ri);CHKERRQ(ierr); 1711 ierr = PetscFree(swaits);CHKERRQ(ierr); 1712 ierr = PetscFree(sstatus);CHKERRQ(ierr); 1713 ierr = PetscFree(buf_s);CHKERRQ(ierr); 1714 1715 /* compute the local portion of C (mpi mat) */ 1716 /*------------------------------------------*/ 1717 /* allocate bi array and free space for accumulating nonzero column info */ 1718 ierr = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 1719 bi[0] = 0; 1720 1721 /* set initial free space to be fill*(nnz(P) + nnz(AP)) */ 1722 nnz = fill*(pdti[pn] + poti[pon] + ai[am]); 1723 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1724 current_space = free_space; 1725 1726 ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr); 1727 for (k=0; k<merge->nrecv; k++){ 1728 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1729 nrows = *buf_ri_k[k]; 1730 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1731 nextci[k] = buf_ri_k[k] + (nrows + 1);/* poins to the next i-structure of k-th recved i-structure */ 1732 } 1733 1734 ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr); 1735 rmax = 0; 1736 for (i=0; i<pn; i++) { 1737 /* add pdt[i,:]*AP into lnk */ 1738 pnz = pdti[i+1] - pdti[i]; 1739 ptJ = pdtj + pdti[i]; 1740 for (j=0; j<pnz; j++){ 1741 row = ptJ[j]; /* row of AP == col of Pt */ 1742 anz = ai[row+1] - ai[row]; 1743 Jptr = aj + ai[row]; 1744 /* add non-zero cols of AP into the sorted linked list lnk */ 1745 ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr); 1746 } 1747 1748 /* add received col data into lnk */ 1749 for (k=0; k<merge->nrecv; k++){ /* k-th received message */ 1750 if (i == *nextrow[k]) { /* i-th row */ 1751 nzi = *(nextci[k]+1) - *nextci[k]; 1752 Jptr = buf_rj[k] + *nextci[k]; 1753 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 1754 nextrow[k]++; nextci[k]++; 1755 } 1756 } 1757 nnz = lnk[0]; 1758 1759 /* if free space is not available, make more free space */ 1760 if (current_space->local_remaining<nnz) { 1761 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1762 nspacedouble++; 1763 } 1764 /* copy data into free space, then initialize lnk */ 1765 ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr); 1766 ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr); 1767 current_space->array += nnz; 1768 current_space->local_used += nnz; 1769 current_space->local_remaining -= nnz; 1770 bi[i+1] = bi[i] + nnz; 1771 if (nnz > rmax) rmax = nnz; 1772 } 1773 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1774 1775 ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 1776 ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr); 1777 afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1); 1778 if (afill_tmp > afill) afill = afill_tmp; 1779 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 1780 ierr = MatDestroy(&POt);CHKERRQ(ierr); 1781 ierr = MatDestroy(&PDt);CHKERRQ(ierr); 1782 1783 /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */ 1784 /*----------------------------------------------------------------------------------*/ 1785 ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr); 1786 ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr); 1787 1788 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 1789 ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1790 ierr = MatSetBlockSizes(Cmpi,P->cmap->bs,A->cmap->bs);CHKERRQ(ierr); 1791 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 1792 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1793 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 1794 ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr); 1795 for (i=0; i<pn; i++){ 1796 row = i + rstart; 1797 nnz = bi[i+1] - bi[i]; 1798 Jptr = bj + bi[i]; 1799 ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr); 1800 } 1801 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1802 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1803 ierr = PetscFree(vals);CHKERRQ(ierr); 1804 1805 merge->bi = bi; 1806 merge->bj = bj; 1807 merge->coi = coi; 1808 merge->coj = coj; 1809 merge->buf_ri = buf_ri; 1810 merge->buf_rj = buf_rj; 1811 merge->owners_co = owners_co; 1812 merge->destroy = Cmpi->ops->destroy; 1813 merge->duplicate = Cmpi->ops->duplicate; 1814 1815 Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable; 1816 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1817 1818 /* attach the supporting struct to Cmpi for reuse */ 1819 c = (Mat_MPIAIJ*)Cmpi->data; 1820 c->ptap = ptap; 1821 ptap->api = PETSC_NULL; 1822 ptap->apj = PETSC_NULL; 1823 ptap->merge = merge; 1824 ptap->rmax = rmax; 1825 ptap->apa = PETSC_NULL; 1826 1827 *C = Cmpi; 1828 #if defined(PETSC_USE_INFO) 1829 if (bi[pn] != 0) { 1830 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr); 1831 ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr); 1832 } else { 1833 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 1834 } 1835 #endif 1836 PetscFunctionReturn(0); 1837 } 1838