1 2 /* 3 Defines matrix-matrix product routines for pairs of MPIAIJ matrices 4 C = A * B 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <../src/mat/impls/aij/mpi/mpiaij.h> 9 #include <petscbt.h> 10 #include <../src/mat/impls/dense/mpi/mpidense.h> 11 #include <petsc-private/vecimpl.h> 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ" 15 PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill, Mat *C) 16 { 17 PetscErrorCode ierr; 18 const char *algTypes[2] = {"scalable","nonscalable"}; 19 PetscInt alg=0; /* set default algorithm */ 20 21 PetscFunctionBegin; 22 if (scall == MAT_INITIAL_MATRIX) { 23 ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 24 ierr = PetscOptionsEList("-matmatmult_via","Algorithmic approach","MatMatMult",algTypes,2,algTypes[0],&alg,NULL);CHKERRQ(ierr); 25 ierr = PetscOptionsEnd();CHKERRQ(ierr); 26 27 ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 28 switch (alg) { 29 case 1: 30 ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A,B,fill,C);CHKERRQ(ierr); 31 break; 32 default: 33 ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr); 34 break; 35 } 36 ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 37 } 38 ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 39 ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr); 40 ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 41 PetscFunctionReturn(0); 42 } 43 44 #undef __FUNCT__ 45 #define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult" 46 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A) 47 { 48 PetscErrorCode ierr; 49 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 50 Mat_PtAPMPI *ptap = a->ptap; 51 52 PetscFunctionBegin; 53 ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr); 54 ierr = PetscFree(ptap->bufa);CHKERRQ(ierr); 55 ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr); 56 ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr); 57 ierr = MatDestroy(&ptap->Pt);CHKERRQ(ierr); 58 ierr = PetscFree(ptap->api);CHKERRQ(ierr); 59 ierr = PetscFree(ptap->apj);CHKERRQ(ierr); 60 ierr = PetscFree(ptap->apa);CHKERRQ(ierr); 61 ierr = ptap->destroy(A);CHKERRQ(ierr); 62 ierr = PetscFree(ptap);CHKERRQ(ierr); 63 PetscFunctionReturn(0); 64 } 65 66 #undef __FUNCT__ 67 #define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult" 68 PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M) 69 { 70 PetscErrorCode ierr; 71 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 72 Mat_PtAPMPI *ptap = a->ptap; 73 74 PetscFunctionBegin; 75 ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr); 76 77 (*M)->ops->destroy = ptap->destroy; /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */ 78 (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */ 79 PetscFunctionReturn(0); 80 } 81 82 #undef __FUNCT__ 83 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ" 84 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C) 85 { 86 PetscErrorCode ierr; 87 Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data; 88 Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 89 Mat_SeqAIJ *cd =(Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data; 90 PetscInt *adi=ad->i,*adj,*aoi=ao->i,*aoj; 91 PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a; 92 Mat_SeqAIJ *p_loc,*p_oth; 93 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj; 94 PetscScalar *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca; 95 PetscInt cm =C->rmap->n,anz,pnz; 96 Mat_PtAPMPI *ptap=c->ptap; 97 PetscInt *api,*apj,*apJ,i,j,k,row; 98 PetscInt cstart=C->cmap->rstart; 99 PetscInt cdnz,conz,k0,k1; 100 101 PetscFunctionBegin; 102 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 103 /*-----------------------------------------------------*/ 104 /* update numerical values of P_oth and P_loc */ 105 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 106 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 107 108 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 109 /*----------------------------------------------------------*/ 110 /* get data from symbolic products */ 111 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 112 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 113 pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a; 114 pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 115 116 /* get apa for storing dense row A[i,:]*P */ 117 apa = ptap->apa; 118 119 api = ptap->api; 120 apj = ptap->apj; 121 for (i=0; i<cm; i++) { 122 /* diagonal portion of A */ 123 anz = adi[i+1] - adi[i]; 124 adj = ad->j + adi[i]; 125 ada = ad->a + adi[i]; 126 for (j=0; j<anz; j++) { 127 row = adj[j]; 128 pnz = pi_loc[row+1] - pi_loc[row]; 129 pj = pj_loc + pi_loc[row]; 130 pa = pa_loc + pi_loc[row]; 131 132 /* perform dense axpy */ 133 valtmp = ada[j]; 134 for (k=0; k<pnz; k++) { 135 apa[pj[k]] += valtmp*pa[k]; 136 } 137 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 138 } 139 140 /* off-diagonal portion of A */ 141 anz = aoi[i+1] - aoi[i]; 142 aoj = ao->j + aoi[i]; 143 aoa = ao->a + aoi[i]; 144 for (j=0; j<anz; j++) { 145 row = aoj[j]; 146 pnz = pi_oth[row+1] - pi_oth[row]; 147 pj = pj_oth + pi_oth[row]; 148 pa = pa_oth + pi_oth[row]; 149 150 /* perform dense axpy */ 151 valtmp = aoa[j]; 152 for (k=0; k<pnz; k++) { 153 apa[pj[k]] += valtmp*pa[k]; 154 } 155 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 156 } 157 158 /* set values in C */ 159 apJ = apj + api[i]; 160 cdnz = cd->i[i+1] - cd->i[i]; 161 conz = co->i[i+1] - co->i[i]; 162 163 /* 1st off-diagoanl part of C */ 164 ca = coa + co->i[i]; 165 k = 0; 166 for (k0=0; k0<conz; k0++) { 167 if (apJ[k] >= cstart) break; 168 ca[k0] = apa[apJ[k]]; 169 apa[apJ[k]] = 0.0; 170 k++; 171 } 172 173 /* diagonal part of C */ 174 ca = cda + cd->i[i]; 175 for (k1=0; k1<cdnz; k1++) { 176 ca[k1] = apa[apJ[k]]; 177 apa[apJ[k]] = 0.0; 178 k++; 179 } 180 181 /* 2nd off-diagoanl part of C */ 182 ca = coa + co->i[i]; 183 for (; k0<conz; k0++) { 184 ca[k0] = apa[apJ[k]]; 185 apa[apJ[k]] = 0.0; 186 k++; 187 } 188 } 189 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 190 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 191 PetscFunctionReturn(0); 192 } 193 194 #undef __FUNCT__ 195 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable" 196 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A,Mat P,PetscReal fill,Mat *C) 197 { 198 PetscErrorCode ierr; 199 MPI_Comm comm; 200 Mat Cmpi; 201 Mat_PtAPMPI *ptap; 202 PetscFreeSpaceList free_space=NULL,current_space=NULL; 203 Mat_MPIAIJ *a =(Mat_MPIAIJ*)A->data,*c; 204 Mat_SeqAIJ *ad =(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth; 205 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz; 206 PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart; 207 PetscInt *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi; 208 PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n; 209 PetscBT lnkbt; 210 PetscScalar *apa; 211 PetscReal afill; 212 PetscInt nlnk_max,armax,prmax; 213 214 PetscFunctionBegin; 215 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 216 if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend) { 217 SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend); 218 } 219 220 /* create struct Mat_PtAPMPI and attached it to C later */ 221 ierr = PetscNew(&ptap);CHKERRQ(ierr); 222 223 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 224 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 225 226 /* get P_loc by taking all local rows of P */ 227 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 228 229 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 230 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 231 pi_loc = p_loc->i; pj_loc = p_loc->j; 232 pi_oth = p_oth->i; pj_oth = p_oth->j; 233 234 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 235 /*-------------------------------------------------------------------*/ 236 ierr = PetscMalloc1((am+2),&api);CHKERRQ(ierr); 237 ptap->api = api; 238 api[0] = 0; 239 240 /* create and initialize a linked list */ 241 armax = ad->rmax+ao->rmax; 242 prmax = PetscMax(p_loc->rmax,p_oth->rmax); 243 nlnk_max = armax*prmax; 244 if (!nlnk_max || nlnk_max > pN) nlnk_max = pN; 245 ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr); 246 247 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 248 ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr); 249 250 current_space = free_space; 251 252 ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr); 253 for (i=0; i<am; i++) { 254 /* diagonal portion of A */ 255 nzi = adi[i+1] - adi[i]; 256 for (j=0; j<nzi; j++) { 257 row = *adj++; 258 pnz = pi_loc[row+1] - pi_loc[row]; 259 Jptr = pj_loc + pi_loc[row]; 260 /* add non-zero cols of P into the sorted linked list lnk */ 261 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 262 } 263 /* off-diagonal portion of A */ 264 nzi = aoi[i+1] - aoi[i]; 265 for (j=0; j<nzi; j++) { 266 row = *aoj++; 267 pnz = pi_oth[row+1] - pi_oth[row]; 268 Jptr = pj_oth + pi_oth[row]; 269 ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 270 } 271 272 apnz = lnk[0]; 273 api[i+1] = api[i] + apnz; 274 275 /* if free space is not available, double the total space in the list */ 276 if (current_space->local_remaining<apnz) { 277 ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 278 nspacedouble++; 279 } 280 281 /* Copy data into free space, then initialize lnk */ 282 ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 283 ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr); 284 285 current_space->array += apnz; 286 current_space->local_used += apnz; 287 current_space->local_remaining -= apnz; 288 } 289 290 /* Allocate space for apj, initialize apj, and */ 291 /* destroy list of free space and other temporary array(s) */ 292 ierr = PetscMalloc1((api[am]+1),&ptap->apj);CHKERRQ(ierr); 293 apj = ptap->apj; 294 ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr); 295 ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 296 297 /* malloc apa to store dense row A[i,:]*P */ 298 ierr = PetscCalloc1(pN,&apa);CHKERRQ(ierr); 299 300 ptap->apa = apa; 301 302 /* create and assemble symbolic parallel matrix Cmpi */ 303 /*----------------------------------------------------*/ 304 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 305 ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 306 ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr); 307 308 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 309 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 310 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 311 for (i=0; i<am; i++) { 312 row = i + rstart; 313 apnz = api[i+1] - api[i]; 314 ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr); 315 apj += apnz; 316 } 317 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 318 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 319 320 ptap->destroy = Cmpi->ops->destroy; 321 ptap->duplicate = Cmpi->ops->duplicate; 322 Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult; 323 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult; 324 325 /* attach the supporting struct to Cmpi for reuse */ 326 c = (Mat_MPIAIJ*)Cmpi->data; 327 c->ptap = ptap; 328 329 *C = Cmpi; 330 331 /* set MatInfo */ 332 afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5; 333 if (afill < 1.0) afill = 1.0; 334 Cmpi->info.mallocs = nspacedouble; 335 Cmpi->info.fill_ratio_given = fill; 336 Cmpi->info.fill_ratio_needed = afill; 337 338 #if defined(PETSC_USE_INFO) 339 if (api[am]) { 340 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr); 341 ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 342 } else { 343 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 344 } 345 #endif 346 PetscFunctionReturn(0); 347 } 348 349 #undef __FUNCT__ 350 #define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense" 351 PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C) 352 { 353 PetscErrorCode ierr; 354 355 PetscFunctionBegin; 356 if (scall == MAT_INITIAL_MATRIX) { 357 ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 358 ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr); 359 ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr); 360 } 361 ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 362 ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr); 363 ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr); 364 PetscFunctionReturn(0); 365 } 366 367 typedef struct { 368 Mat workB; 369 PetscScalar *rvalues,*svalues; 370 MPI_Request *rwaits,*swaits; 371 } MPIAIJ_MPIDense; 372 373 #undef __FUNCT__ 374 #define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy" 375 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx) 376 { 377 MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx; 378 PetscErrorCode ierr; 379 380 PetscFunctionBegin; 381 ierr = MatDestroy(&contents->workB);CHKERRQ(ierr); 382 ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr); 383 ierr = PetscFree(contents);CHKERRQ(ierr); 384 PetscFunctionReturn(0); 385 } 386 387 #undef __FUNCT__ 388 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense" 389 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C) 390 { 391 PetscErrorCode ierr; 392 Mat_MPIAIJ *aij = (Mat_MPIAIJ*) A->data; 393 PetscInt nz = aij->B->cmap->n; 394 PetscContainer container; 395 MPIAIJ_MPIDense *contents; 396 VecScatter ctx = aij->Mvctx; 397 VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata; 398 VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata; 399 PetscInt m = A->rmap->n,n=B->cmap->n; 400 401 PetscFunctionBegin; 402 ierr = MatCreate(PetscObjectComm((PetscObject)B),C);CHKERRQ(ierr); 403 ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr); 404 ierr = MatSetBlockSizesFromMats(*C,A,B);CHKERRQ(ierr); 405 ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr); 406 ierr = MatMPIDenseSetPreallocation(*C,NULL);CHKERRQ(ierr); 407 ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 408 ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 409 410 (*C)->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense; 411 412 ierr = PetscNew(&contents);CHKERRQ(ierr); 413 /* Create work matrix used to store off processor rows of B needed for local product */ 414 ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,NULL,&contents->workB);CHKERRQ(ierr); 415 /* Create work arrays needed */ 416 ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],&contents->rvalues, 417 B->cmap->N*to->starts[to->n],&contents->svalues, 418 from->n,&contents->rwaits, 419 to->n,&contents->swaits);CHKERRQ(ierr); 420 421 ierr = PetscContainerCreate(PetscObjectComm((PetscObject)A),&container);CHKERRQ(ierr); 422 ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr); 423 ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr); 424 ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr); 425 ierr = PetscContainerDestroy(&container);CHKERRQ(ierr); 426 PetscFunctionReturn(0); 427 } 428 429 #undef __FUNCT__ 430 #define __FUNCT__ "MatMPIDenseScatter" 431 /* 432 Performs an efficient scatter on the rows of B needed by this process; this is 433 a modification of the VecScatterBegin_() routines. 434 */ 435 PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB) 436 { 437 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 438 PetscErrorCode ierr; 439 PetscScalar *b,*w,*svalues,*rvalues; 440 VecScatter ctx = aij->Mvctx; 441 VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata; 442 VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata; 443 PetscInt i,j,k; 444 PetscInt *sindices,*sstarts,*rindices,*rstarts; 445 PetscMPIInt *sprocs,*rprocs,nrecvs; 446 MPI_Request *swaits,*rwaits; 447 MPI_Comm comm; 448 PetscMPIInt tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N, nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n; 449 MPI_Status status; 450 MPIAIJ_MPIDense *contents; 451 PetscContainer container; 452 Mat workB; 453 454 PetscFunctionBegin; 455 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 456 ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr); 457 if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist"); 458 ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr); 459 460 workB = *outworkB = contents->workB; 461 if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",nrows,workB->cmap->n); 462 sindices = to->indices; 463 sstarts = to->starts; 464 sprocs = to->procs; 465 swaits = contents->swaits; 466 svalues = contents->svalues; 467 468 rindices = from->indices; 469 rstarts = from->starts; 470 rprocs = from->procs; 471 rwaits = contents->rwaits; 472 rvalues = contents->rvalues; 473 474 ierr = MatDenseGetArray(B,&b);CHKERRQ(ierr); 475 ierr = MatDenseGetArray(workB,&w);CHKERRQ(ierr); 476 477 for (i=0; i<from->n; i++) { 478 ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr); 479 } 480 481 for (i=0; i<to->n; i++) { 482 /* pack a message at a time */ 483 for (j=0; j<sstarts[i+1]-sstarts[i]; j++) { 484 for (k=0; k<ncols; k++) { 485 svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k]; 486 } 487 } 488 ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr); 489 } 490 491 nrecvs = from->n; 492 while (nrecvs) { 493 ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr); 494 nrecvs--; 495 /* unpack a message at a time */ 496 for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++) { 497 for (k=0; k<ncols; k++) { 498 w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k]; 499 } 500 } 501 } 502 if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);} 503 504 ierr = MatDenseRestoreArray(B,&b);CHKERRQ(ierr); 505 ierr = MatDenseRestoreArray(workB,&w);CHKERRQ(ierr); 506 ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 507 ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 508 PetscFunctionReturn(0); 509 } 510 extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat); 511 512 #undef __FUNCT__ 513 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense" 514 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C) 515 { 516 PetscErrorCode ierr; 517 Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 518 Mat_MPIDense *bdense = (Mat_MPIDense*)B->data; 519 Mat_MPIDense *cdense = (Mat_MPIDense*)C->data; 520 Mat workB; 521 522 PetscFunctionBegin; 523 /* diagonal block of A times all local rows of B*/ 524 ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr); 525 526 /* get off processor parts of B needed to complete the product */ 527 ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr); 528 529 /* off-diagonal block of A times nonlocal rows of B */ 530 ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr); 531 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 532 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 533 PetscFunctionReturn(0); 534 } 535 536 #undef __FUNCT__ 537 #define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable" 538 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C) 539 { 540 PetscErrorCode ierr; 541 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data; 542 Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data; 543 Mat_SeqAIJ *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data; 544 PetscInt *adi = ad->i,*adj,*aoi=ao->i,*aoj; 545 PetscScalar *ada,*aoa,*cda=cd->a,*coa=co->a; 546 Mat_SeqAIJ *p_loc,*p_oth; 547 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj; 548 PetscScalar *pa_loc,*pa_oth,*pa,valtmp,*ca; 549 PetscInt cm = C->rmap->n,anz,pnz; 550 Mat_PtAPMPI *ptap = c->ptap; 551 PetscScalar *apa_sparse = ptap->apa; 552 PetscInt *api,*apj,*apJ,i,j,k,row; 553 PetscInt cstart = C->cmap->rstart; 554 PetscInt cdnz,conz,k0,k1,nextp; 555 556 PetscFunctionBegin; 557 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 558 /*-----------------------------------------------------*/ 559 /* update numerical values of P_oth and P_loc */ 560 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 561 ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 562 563 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 564 /*----------------------------------------------------------*/ 565 /* get data from symbolic products */ 566 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 567 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 568 pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a; 569 pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a; 570 571 api = ptap->api; 572 apj = ptap->apj; 573 for (i=0; i<cm; i++) { 574 apJ = apj + api[i]; 575 576 /* diagonal portion of A */ 577 anz = adi[i+1] - adi[i]; 578 adj = ad->j + adi[i]; 579 ada = ad->a + adi[i]; 580 for (j=0; j<anz; j++) { 581 row = adj[j]; 582 pnz = pi_loc[row+1] - pi_loc[row]; 583 pj = pj_loc + pi_loc[row]; 584 pa = pa_loc + pi_loc[row]; 585 /* perform sparse axpy */ 586 valtmp = ada[j]; 587 nextp = 0; 588 for (k=0; nextp<pnz; k++) { 589 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 590 apa_sparse[k] += valtmp*pa[nextp++]; 591 } 592 } 593 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 594 } 595 596 /* off-diagonal portion of A */ 597 anz = aoi[i+1] - aoi[i]; 598 aoj = ao->j + aoi[i]; 599 aoa = ao->a + aoi[i]; 600 for (j=0; j<anz; j++) { 601 row = aoj[j]; 602 pnz = pi_oth[row+1] - pi_oth[row]; 603 pj = pj_oth + pi_oth[row]; 604 pa = pa_oth + pi_oth[row]; 605 /* perform sparse axpy */ 606 valtmp = aoa[j]; 607 nextp = 0; 608 for (k=0; nextp<pnz; k++) { 609 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 610 apa_sparse[k] += valtmp*pa[nextp++]; 611 } 612 } 613 ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr); 614 } 615 616 /* set values in C */ 617 cdnz = cd->i[i+1] - cd->i[i]; 618 conz = co->i[i+1] - co->i[i]; 619 620 /* 1st off-diagoanl part of C */ 621 ca = coa + co->i[i]; 622 k = 0; 623 for (k0=0; k0<conz; k0++) { 624 if (apJ[k] >= cstart) break; 625 ca[k0] = apa_sparse[k]; 626 apa_sparse[k] = 0.0; 627 k++; 628 } 629 630 /* diagonal part of C */ 631 ca = cda + cd->i[i]; 632 for (k1=0; k1<cdnz; k1++) { 633 ca[k1] = apa_sparse[k]; 634 apa_sparse[k] = 0.0; 635 k++; 636 } 637 638 /* 2nd off-diagoanl part of C */ 639 ca = coa + co->i[i]; 640 for (; k0<conz; k0++) { 641 ca[k0] = apa_sparse[k]; 642 apa_sparse[k] = 0.0; 643 k++; 644 } 645 } 646 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 647 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 648 PetscFunctionReturn(0); 649 } 650 651 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */ 652 #undef __FUNCT__ 653 #define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ" 654 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C) 655 { 656 PetscErrorCode ierr; 657 MPI_Comm comm; 658 Mat Cmpi; 659 Mat_PtAPMPI *ptap; 660 PetscFreeSpaceList free_space = NULL,current_space=NULL; 661 Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data,*c; 662 Mat_SeqAIJ *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth; 663 PetscInt *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz; 664 PetscInt *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart; 665 PetscInt i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0; 666 PetscInt am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n; 667 PetscInt nlnk_max,armax,prmax; 668 PetscReal afill; 669 PetscScalar *apa; 670 671 PetscFunctionBegin; 672 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 673 /* create struct Mat_PtAPMPI and attached it to C later */ 674 ierr = PetscNew(&ptap);CHKERRQ(ierr); 675 676 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 677 ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr); 678 679 /* get P_loc by taking all local rows of P */ 680 ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr); 681 682 p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data; 683 p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data; 684 pi_loc = p_loc->i; pj_loc = p_loc->j; 685 pi_oth = p_oth->i; pj_oth = p_oth->j; 686 687 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 688 /*-------------------------------------------------------------------*/ 689 ierr = PetscMalloc1((am+2),&api);CHKERRQ(ierr); 690 ptap->api = api; 691 api[0] = 0; 692 693 /* create and initialize a linked list */ 694 armax = ad->rmax+ao->rmax; 695 prmax = PetscMax(p_loc->rmax,p_oth->rmax); 696 nlnk_max = armax*prmax; 697 if (!nlnk_max || nlnk_max > pN) nlnk_max = pN; 698 ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr); 699 700 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 701 ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr); 702 703 current_space = free_space; 704 705 ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr); 706 for (i=0; i<am; i++) { 707 /* diagonal portion of A */ 708 nzi = adi[i+1] - adi[i]; 709 for (j=0; j<nzi; j++) { 710 row = *adj++; 711 pnz = pi_loc[row+1] - pi_loc[row]; 712 Jptr = pj_loc + pi_loc[row]; 713 /* add non-zero cols of P into the sorted linked list lnk */ 714 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 715 } 716 /* off-diagonal portion of A */ 717 nzi = aoi[i+1] - aoi[i]; 718 for (j=0; j<nzi; j++) { 719 row = *aoj++; 720 pnz = pi_oth[row+1] - pi_oth[row]; 721 Jptr = pj_oth + pi_oth[row]; 722 ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr); 723 } 724 725 apnz = *lnk; 726 api[i+1] = api[i] + apnz; 727 if (apnz > apnz_max) apnz_max = apnz; 728 729 /* if free space is not available, double the total space in the list */ 730 if (current_space->local_remaining<apnz) { 731 ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 732 nspacedouble++; 733 } 734 735 /* Copy data into free space, then initialize lnk */ 736 ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr); 737 ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr); 738 739 current_space->array += apnz; 740 current_space->local_used += apnz; 741 current_space->local_remaining -= apnz; 742 } 743 744 /* Allocate space for apj, initialize apj, and */ 745 /* destroy list of free space and other temporary array(s) */ 746 ierr = PetscMalloc1((api[am]+1),&ptap->apj);CHKERRQ(ierr); 747 apj = ptap->apj; 748 ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr); 749 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 750 751 /* create and assemble symbolic parallel matrix Cmpi */ 752 /*----------------------------------------------------*/ 753 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 754 ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 755 ierr = MatSetBlockSizesFromMats(Cmpi,A,P);CHKERRQ(ierr); 756 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 757 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 758 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 759 760 /* malloc apa for assembly Cmpi */ 761 ierr = PetscCalloc1(apnz_max,&apa);CHKERRQ(ierr); 762 763 ptap->apa = apa; 764 for (i=0; i<am; i++) { 765 row = i + rstart; 766 apnz = api[i+1] - api[i]; 767 ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr); 768 apj += apnz; 769 } 770 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 771 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 772 773 ptap->destroy = Cmpi->ops->destroy; 774 ptap->duplicate = Cmpi->ops->duplicate; 775 Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable; 776 Cmpi->ops->destroy = MatDestroy_MPIAIJ_MatMatMult; 777 Cmpi->ops->duplicate = MatDuplicate_MPIAIJ_MatMatMult; 778 779 /* attach the supporting struct to Cmpi for reuse */ 780 c = (Mat_MPIAIJ*)Cmpi->data; 781 c->ptap = ptap; 782 783 *C = Cmpi; 784 785 /* set MatInfo */ 786 afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]+1) + 1.e-5; 787 if (afill < 1.0) afill = 1.0; 788 Cmpi->info.mallocs = nspacedouble; 789 Cmpi->info.fill_ratio_given = fill; 790 Cmpi->info.fill_ratio_needed = afill; 791 792 #if defined(PETSC_USE_INFO) 793 if (api[am]) { 794 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr); 795 ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n",(double)afill);CHKERRQ(ierr); 796 } else { 797 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 798 } 799 #endif 800 PetscFunctionReturn(0); 801 } 802 803 /*-------------------------------------------------------------------------*/ 804 #undef __FUNCT__ 805 #define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ" 806 PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C) 807 { 808 PetscErrorCode ierr; 809 const char *algTypes[3] = {"scalable","nonscalable","matmatmult"}; 810 PetscInt alg=0; /* set default algorithm */ 811 812 PetscFunctionBegin; 813 if (scall == MAT_INITIAL_MATRIX) { 814 ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr); 815 ierr = PetscOptionsEList("-mattransposematmult_via","Algorithmic approach","MatTransposeMatMult",algTypes,3,algTypes[0],&alg,NULL);CHKERRQ(ierr); 816 ierr = PetscOptionsEnd();CHKERRQ(ierr); 817 818 ierr = PetscLogEventBegin(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr); 819 switch (alg) { 820 case 1: 821 ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(P,A,fill,C);CHKERRQ(ierr); 822 break; 823 case 2: 824 { 825 Mat Pt; 826 Mat_PtAPMPI *ptap; 827 Mat_MPIAIJ *c; 828 ierr = MatTranspose(P,MAT_INITIAL_MATRIX,&Pt);CHKERRQ(ierr); 829 ierr = MatMatMult(Pt,A,MAT_INITIAL_MATRIX,fill,C);CHKERRQ(ierr); 830 c = (Mat_MPIAIJ*)(*C)->data; 831 ptap = c->ptap; 832 ptap->Pt = Pt; 833 (*C)->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult; 834 PetscFunctionReturn(0); 835 } 836 break; 837 default: 838 ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr); 839 break; 840 } 841 ierr = PetscLogEventEnd(MAT_TransposeMatMultSymbolic,P,A,0,0);CHKERRQ(ierr); 842 } 843 ierr = PetscLogEventBegin(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr); 844 ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr); 845 ierr = PetscLogEventEnd(MAT_TransposeMatMultNumeric,P,A,0,0);CHKERRQ(ierr); 846 PetscFunctionReturn(0); 847 } 848 849 /* This routine only works when scall=MAT_REUSE_MATRIX! */ 850 #undef __FUNCT__ 851 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult" 852 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P,Mat A,Mat C) 853 { 854 PetscErrorCode ierr; 855 Mat_MPIAIJ *c=(Mat_MPIAIJ*)C->data; 856 Mat_PtAPMPI *ptap= c->ptap; 857 Mat Pt=ptap->Pt; 858 859 PetscFunctionBegin; 860 ierr = MatTranspose(P,MAT_REUSE_MATRIX,&Pt);CHKERRQ(ierr); 861 ierr = MatMatMultNumeric(Pt,A,C);CHKERRQ(ierr); 862 PetscFunctionReturn(0); 863 } 864 865 /* Non-scalable version, use dense axpy */ 866 #undef __FUNCT__ 867 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable" 868 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,Mat C) 869 { 870 PetscErrorCode ierr; 871 Mat_Merge_SeqsToMPI *merge; 872 Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 873 Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data; 874 Mat_PtAPMPI *ptap; 875 PetscInt *adj,*aJ; 876 PetscInt i,j,k,anz,pnz,row,*cj; 877 MatScalar *ada,*aval,*ca,valtmp; 878 PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n; 879 MPI_Comm comm; 880 PetscMPIInt size,rank,taga,*len_s; 881 PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci; 882 PetscInt **buf_ri,**buf_rj; 883 PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */ 884 MPI_Request *s_waits,*r_waits; 885 MPI_Status *status; 886 MatScalar **abuf_r,*ba_i,*pA,*coa,*ba; 887 PetscInt *ai,*aj,*coi,*coj; 888 PetscInt *poJ,*pdJ; 889 Mat A_loc; 890 Mat_SeqAIJ *a_loc; 891 892 PetscFunctionBegin; 893 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 894 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 895 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 896 897 ptap = c->ptap; 898 merge = ptap->merge; 899 900 /* 2) compute numeric C_seq = P_loc^T*A_loc*P - dominating part */ 901 /*--------------------------------------------------------------*/ 902 /* get data from symbolic products */ 903 coi = merge->coi; coj = merge->coj; 904 ierr = PetscCalloc1((coi[pon]+1),&coa);CHKERRQ(ierr); 905 906 bi = merge->bi; bj = merge->bj; 907 owners = merge->rowmap->range; 908 ierr = PetscCalloc1((bi[cm]+1),&ba);CHKERRQ(ierr); 909 910 /* get A_loc by taking all local rows of A */ 911 A_loc = ptap->A_loc; 912 ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr); 913 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 914 ai = a_loc->i; 915 aj = a_loc->j; 916 917 ierr = PetscCalloc1((A->cmap->N),&aval);CHKERRQ(ierr); /* non-scalable!!! */ 918 919 for (i=0; i<am; i++) { 920 /* 2-a) put A[i,:] to dense array aval */ 921 anz = ai[i+1] - ai[i]; 922 adj = aj + ai[i]; 923 ada = a_loc->a + ai[i]; 924 for (j=0; j<anz; j++) { 925 aval[adj[j]] = ada[j]; 926 } 927 928 /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */ 929 /*--------------------------------------------------------------*/ 930 /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */ 931 pnz = po->i[i+1] - po->i[i]; 932 poJ = po->j + po->i[i]; 933 pA = po->a + po->i[i]; 934 for (j=0; j<pnz; j++) { 935 row = poJ[j]; 936 cnz = coi[row+1] - coi[row]; 937 cj = coj + coi[row]; 938 ca = coa + coi[row]; 939 /* perform dense axpy */ 940 valtmp = pA[j]; 941 for (k=0; k<cnz; k++) { 942 ca[k] += valtmp*aval[cj[k]]; 943 } 944 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 945 } 946 947 /* put the value into Cd (diagonal part) */ 948 pnz = pd->i[i+1] - pd->i[i]; 949 pdJ = pd->j + pd->i[i]; 950 pA = pd->a + pd->i[i]; 951 for (j=0; j<pnz; j++) { 952 row = pdJ[j]; 953 cnz = bi[row+1] - bi[row]; 954 cj = bj + bi[row]; 955 ca = ba + bi[row]; 956 /* perform dense axpy */ 957 valtmp = pA[j]; 958 for (k=0; k<cnz; k++) { 959 ca[k] += valtmp*aval[cj[k]]; 960 } 961 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 962 } 963 964 /* zero the current row of Pt*A */ 965 aJ = aj + ai[i]; 966 for (k=0; k<anz; k++) aval[aJ[k]] = 0.0; 967 } 968 969 /* 3) send and recv matrix values coa */ 970 /*------------------------------------*/ 971 buf_ri = merge->buf_ri; 972 buf_rj = merge->buf_rj; 973 len_s = merge->len_s; 974 ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr); 975 ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 976 977 ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr); 978 for (proc=0,k=0; proc<size; proc++) { 979 if (!len_s[proc]) continue; 980 i = merge->owners_co[proc]; 981 ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 982 k++; 983 } 984 if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);} 985 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);} 986 987 ierr = PetscFree2(s_waits,status);CHKERRQ(ierr); 988 ierr = PetscFree(r_waits);CHKERRQ(ierr); 989 ierr = PetscFree(coa);CHKERRQ(ierr); 990 991 /* 4) insert local Cseq and received values into Cmpi */ 992 /*----------------------------------------------------*/ 993 ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr); 994 for (k=0; k<merge->nrecv; k++) { 995 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 996 nrows = *(buf_ri_k[k]); 997 nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 998 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 999 } 1000 1001 for (i=0; i<cm; i++) { 1002 row = owners[rank] + i; /* global row index of C_seq */ 1003 bj_i = bj + bi[i]; /* col indices of the i-th row of C */ 1004 ba_i = ba + bi[i]; 1005 bnz = bi[i+1] - bi[i]; 1006 /* add received vals into ba */ 1007 for (k=0; k<merge->nrecv; k++) { /* k-th received message */ 1008 /* i-th row */ 1009 if (i == *nextrow[k]) { 1010 cnz = *(nextci[k]+1) - *nextci[k]; 1011 cj = buf_rj[k] + *(nextci[k]); 1012 ca = abuf_r[k] + *(nextci[k]); 1013 nextcj = 0; 1014 for (j=0; nextcj<cnz; j++) { 1015 if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */ 1016 ba_i[j] += ca[nextcj++]; 1017 } 1018 } 1019 nextrow[k]++; nextci[k]++; 1020 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 1021 } 1022 } 1023 ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 1024 } 1025 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1026 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1027 1028 ierr = PetscFree(ba);CHKERRQ(ierr); 1029 ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr); 1030 ierr = PetscFree(abuf_r);CHKERRQ(ierr); 1031 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1032 ierr = PetscFree(aval);CHKERRQ(ierr); 1033 PetscFunctionReturn(0); 1034 } 1035 1036 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */ 1037 #undef __FUNCT__ 1038 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable" 1039 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P,Mat A,PetscReal fill,Mat *C) 1040 { 1041 PetscErrorCode ierr; 1042 Mat Cmpi,A_loc,POt,PDt; 1043 Mat_PtAPMPI *ptap; 1044 PetscFreeSpaceList free_space=NULL,current_space=NULL; 1045 Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c; 1046 PetscInt *pdti,*pdtj,*poti,*potj,*ptJ; 1047 PetscInt nnz; 1048 PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row; 1049 PetscInt am=A->rmap->n,pn=P->cmap->n; 1050 PetscBT lnkbt; 1051 MPI_Comm comm; 1052 PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri; 1053 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 1054 PetscInt len,proc,*dnz,*onz,*owners; 1055 PetscInt nzi,*bi,*bj; 1056 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1057 MPI_Request *swaits,*rwaits; 1058 MPI_Status *sstatus,rstatus; 1059 Mat_Merge_SeqsToMPI *merge; 1060 PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j; 1061 PetscReal afill =1.0,afill_tmp; 1062 PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax; 1063 PetscScalar *vals; 1064 Mat_SeqAIJ *a_loc, *pdt,*pot; 1065 1066 PetscFunctionBegin; 1067 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1068 /* check if matrix local sizes are compatible */ 1069 if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) { 1070 SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend); 1071 } 1072 1073 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1074 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1075 1076 /* create struct Mat_PtAPMPI and attached it to C later */ 1077 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1078 1079 /* get A_loc by taking all local rows of A */ 1080 ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr); 1081 1082 ptap->A_loc = A_loc; 1083 1084 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 1085 ai = a_loc->i; 1086 aj = a_loc->j; 1087 1088 /* determine symbolic Co=(p->B)^T*A - send to others */ 1089 /*----------------------------------------------------*/ 1090 ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr); 1091 pdt = (Mat_SeqAIJ*)PDt->data; 1092 pdti = pdt->i; pdtj = pdt->j; 1093 1094 ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr); 1095 pot = (Mat_SeqAIJ*)POt->data; 1096 poti = pot->i; potj = pot->j; 1097 1098 /* then, compute symbolic Co = (p->B)^T*A */ 1099 pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors >= (num of nonzero rows of C_seq) - pn */ 1100 ierr = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr); 1101 coi[0] = 0; 1102 1103 /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */ 1104 nnz = fill*(poti[pon] + ai[am]); 1105 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1106 current_space = free_space; 1107 1108 /* create and initialize a linked list */ 1109 i = PetscMax(pdt->rmax,pot->rmax); 1110 Crmax = i*a_loc->rmax*size; 1111 if (!Crmax || Crmax > aN) Crmax = aN; 1112 ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr); 1113 1114 for (i=0; i<pon; i++) { 1115 pnz = poti[i+1] - poti[i]; 1116 ptJ = potj + poti[i]; 1117 for (j=0; j<pnz; j++) { 1118 row = ptJ[j]; /* row of A_loc == col of Pot */ 1119 anz = ai[row+1] - ai[row]; 1120 Jptr = aj + ai[row]; 1121 /* add non-zero cols of AP into the sorted linked list lnk */ 1122 ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1123 } 1124 nnz = lnk[0]; 1125 1126 /* If free space is not available, double the total space in the list */ 1127 if (current_space->local_remaining<nnz) { 1128 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1129 nspacedouble++; 1130 } 1131 1132 /* Copy data into free space, and zero out denserows */ 1133 ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1134 1135 current_space->array += nnz; 1136 current_space->local_used += nnz; 1137 current_space->local_remaining -= nnz; 1138 1139 coi[i+1] = coi[i] + nnz; 1140 } 1141 1142 ierr = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr); 1143 ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr); 1144 1145 afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1); 1146 if (afill_tmp > afill) afill = afill_tmp; 1147 1148 /* send j-array (coj) of Co to other processors */ 1149 /*----------------------------------------------*/ 1150 /* determine row ownership */ 1151 ierr = PetscNew(&merge);CHKERRQ(ierr); 1152 ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr); 1153 1154 merge->rowmap->n = pn; 1155 merge->rowmap->bs = 1; 1156 1157 ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr); 1158 owners = merge->rowmap->range; 1159 1160 /* determine the number of messages to send, their lengths */ 1161 ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr); 1162 ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr); 1163 1164 len_s = merge->len_s; 1165 merge->nsend = 0; 1166 1167 ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr); 1168 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 1169 1170 proc = 0; 1171 for (i=0; i<pon; i++) { 1172 while (prmap[i] >= owners[proc+1]) proc++; 1173 len_si[proc]++; /* num of rows in Co to be sent to [proc] */ 1174 len_s[proc] += coi[i+1] - coi[i]; 1175 } 1176 1177 len = 0; /* max length of buf_si[] */ 1178 owners_co[0] = 0; 1179 for (proc=0; proc<size; proc++) { 1180 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1181 if (len_si[proc]) { 1182 merge->nsend++; 1183 len_si[proc] = 2*(len_si[proc] + 1); 1184 len += len_si[proc]; 1185 } 1186 } 1187 1188 /* determine the number and length of messages to receive for coi and coj */ 1189 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 1190 ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 1191 1192 /* post the Irecv and Isend of coj */ 1193 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 1194 ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 1195 ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr); 1196 for (proc=0, k=0; proc<size; proc++) { 1197 if (!len_s[proc]) continue; 1198 i = owners_co[proc]; 1199 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 1200 k++; 1201 } 1202 1203 /* receives and sends of coj are complete */ 1204 ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr); 1205 for (i=0; i<merge->nrecv; i++) { 1206 PetscMPIInt icompleted; 1207 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1208 } 1209 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1210 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1211 1212 /* send and recv coi */ 1213 /*-------------------*/ 1214 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 1215 ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 1216 ierr = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr); 1217 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1218 for (proc=0,k=0; proc<size; proc++) { 1219 if (!len_s[proc]) continue; 1220 /* form outgoing message for i-structure: 1221 buf_si[0]: nrows to be sent 1222 [1:nrows]: row index (global) 1223 [nrows+1:2*nrows+1]: i-structure index 1224 */ 1225 /*-------------------------------------------*/ 1226 nrows = len_si[proc]/2 - 1; 1227 buf_si_i = buf_si + nrows+1; 1228 buf_si[0] = nrows; 1229 buf_si_i[0] = 0; 1230 nrows = 0; 1231 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 1232 nzi = coi[i+1] - coi[i]; 1233 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 1234 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 1235 nrows++; 1236 } 1237 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 1238 k++; 1239 buf_si += len_si[proc]; 1240 } 1241 i = merge->nrecv; 1242 while (i--) { 1243 PetscMPIInt icompleted; 1244 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1245 } 1246 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1247 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1248 ierr = PetscFree(len_si);CHKERRQ(ierr); 1249 ierr = PetscFree(len_ri);CHKERRQ(ierr); 1250 ierr = PetscFree(swaits);CHKERRQ(ierr); 1251 ierr = PetscFree(sstatus);CHKERRQ(ierr); 1252 ierr = PetscFree(buf_s);CHKERRQ(ierr); 1253 1254 /* compute the local portion of C (mpi mat) */ 1255 /*------------------------------------------*/ 1256 /* allocate bi array and free space for accumulating nonzero column info */ 1257 ierr = PetscMalloc1((pn+1),&bi);CHKERRQ(ierr); 1258 bi[0] = 0; 1259 1260 /* set initial free space to be fill*(nnz(P) + nnz(A)) */ 1261 nnz = fill*(pdti[pn] + poti[pon] + ai[am]); 1262 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1263 current_space = free_space; 1264 1265 ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr); 1266 for (k=0; k<merge->nrecv; k++) { 1267 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1268 nrows = *buf_ri_k[k]; 1269 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1270 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 1271 } 1272 1273 ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr); 1274 rmax = 0; 1275 for (i=0; i<pn; i++) { 1276 /* add pdt[i,:]*AP into lnk */ 1277 pnz = pdti[i+1] - pdti[i]; 1278 ptJ = pdtj + pdti[i]; 1279 for (j=0; j<pnz; j++) { 1280 row = ptJ[j]; /* row of AP == col of Pt */ 1281 anz = ai[row+1] - ai[row]; 1282 Jptr = aj + ai[row]; 1283 /* add non-zero cols of AP into the sorted linked list lnk */ 1284 ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1285 } 1286 1287 /* add received col data into lnk */ 1288 for (k=0; k<merge->nrecv; k++) { /* k-th received message */ 1289 if (i == *nextrow[k]) { /* i-th row */ 1290 nzi = *(nextci[k]+1) - *nextci[k]; 1291 Jptr = buf_rj[k] + *nextci[k]; 1292 ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr); 1293 nextrow[k]++; nextci[k]++; 1294 } 1295 } 1296 nnz = lnk[0]; 1297 1298 /* if free space is not available, make more free space */ 1299 if (current_space->local_remaining<nnz) { 1300 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1301 nspacedouble++; 1302 } 1303 /* copy data into free space, then initialize lnk */ 1304 ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr); 1305 ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr); 1306 1307 current_space->array += nnz; 1308 current_space->local_used += nnz; 1309 current_space->local_remaining -= nnz; 1310 1311 bi[i+1] = bi[i] + nnz; 1312 if (nnz > rmax) rmax = nnz; 1313 } 1314 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1315 1316 ierr = PetscMalloc1((bi[pn]+1),&bj);CHKERRQ(ierr); 1317 ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr); 1318 1319 afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1); 1320 if (afill_tmp > afill) afill = afill_tmp; 1321 ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr); 1322 ierr = MatDestroy(&POt);CHKERRQ(ierr); 1323 ierr = MatDestroy(&PDt);CHKERRQ(ierr); 1324 1325 /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */ 1326 /*----------------------------------------------------------------------------------*/ 1327 ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr); 1328 1329 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 1330 ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1331 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr); 1332 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 1333 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1334 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 1335 ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr); 1336 for (i=0; i<pn; i++) { 1337 row = i + rstart; 1338 nnz = bi[i+1] - bi[i]; 1339 Jptr = bj + bi[i]; 1340 ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr); 1341 } 1342 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1343 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1344 ierr = PetscFree(vals);CHKERRQ(ierr); 1345 1346 merge->bi = bi; 1347 merge->bj = bj; 1348 merge->coi = coi; 1349 merge->coj = coj; 1350 merge->buf_ri = buf_ri; 1351 merge->buf_rj = buf_rj; 1352 merge->owners_co = owners_co; 1353 merge->destroy = Cmpi->ops->destroy; 1354 merge->duplicate = Cmpi->ops->duplicate; 1355 1356 Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable; 1357 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1358 1359 /* attach the supporting struct to Cmpi for reuse */ 1360 c = (Mat_MPIAIJ*)Cmpi->data; 1361 c->ptap = ptap; 1362 ptap->api = NULL; 1363 ptap->apj = NULL; 1364 ptap->merge = merge; 1365 ptap->rmax = rmax; 1366 1367 *C = Cmpi; 1368 #if defined(PETSC_USE_INFO) 1369 if (bi[pn] != 0) { 1370 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr); 1371 ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr); 1372 } else { 1373 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 1374 } 1375 #endif 1376 PetscFunctionReturn(0); 1377 } 1378 1379 #undef __FUNCT__ 1380 #define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ" 1381 PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C) 1382 { 1383 PetscErrorCode ierr; 1384 Mat_Merge_SeqsToMPI *merge; 1385 Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data; 1386 Mat_SeqAIJ *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data; 1387 Mat_PtAPMPI *ptap; 1388 PetscInt *adj; 1389 PetscInt i,j,k,anz,pnz,row,*cj,nexta; 1390 MatScalar *ada,*ca,valtmp; 1391 PetscInt am =A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n; 1392 MPI_Comm comm; 1393 PetscMPIInt size,rank,taga,*len_s; 1394 PetscInt *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci; 1395 PetscInt **buf_ri,**buf_rj; 1396 PetscInt cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */ 1397 MPI_Request *s_waits,*r_waits; 1398 MPI_Status *status; 1399 MatScalar **abuf_r,*ba_i,*pA,*coa,*ba; 1400 PetscInt *ai,*aj,*coi,*coj; 1401 PetscInt *poJ,*pdJ; 1402 Mat A_loc; 1403 Mat_SeqAIJ *a_loc; 1404 1405 PetscFunctionBegin; 1406 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr); 1407 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1408 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1409 1410 ptap = c->ptap; 1411 merge = ptap->merge; 1412 1413 /* 2) compute numeric C_seq = P_loc^T*A_loc */ 1414 /*------------------------------------------*/ 1415 /* get data from symbolic products */ 1416 coi = merge->coi; coj = merge->coj; 1417 ierr = PetscCalloc1((coi[pon]+1),&coa);CHKERRQ(ierr); 1418 bi = merge->bi; bj = merge->bj; 1419 owners = merge->rowmap->range; 1420 ierr = PetscCalloc1(bi[cm]+1,&ba);CHKERRQ(ierr); 1421 1422 /* get A_loc by taking all local rows of A */ 1423 A_loc = ptap->A_loc; 1424 ierr = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr); 1425 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 1426 ai = a_loc->i; 1427 aj = a_loc->j; 1428 1429 for (i=0; i<am; i++) { 1430 anz = ai[i+1] - ai[i]; 1431 adj = aj + ai[i]; 1432 ada = a_loc->a + ai[i]; 1433 1434 /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */ 1435 /*-------------------------------------------------------------*/ 1436 /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */ 1437 pnz = po->i[i+1] - po->i[i]; 1438 poJ = po->j + po->i[i]; 1439 pA = po->a + po->i[i]; 1440 for (j=0; j<pnz; j++) { 1441 row = poJ[j]; 1442 cj = coj + coi[row]; 1443 ca = coa + coi[row]; 1444 /* perform sparse axpy */ 1445 nexta = 0; 1446 valtmp = pA[j]; 1447 for (k=0; nexta<anz; k++) { 1448 if (cj[k] == adj[nexta]) { 1449 ca[k] += valtmp*ada[nexta]; 1450 nexta++; 1451 } 1452 } 1453 ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr); 1454 } 1455 1456 /* put the value into Cd (diagonal part) */ 1457 pnz = pd->i[i+1] - pd->i[i]; 1458 pdJ = pd->j + pd->i[i]; 1459 pA = pd->a + pd->i[i]; 1460 for (j=0; j<pnz; j++) { 1461 row = pdJ[j]; 1462 cj = bj + bi[row]; 1463 ca = ba + bi[row]; 1464 /* perform sparse axpy */ 1465 nexta = 0; 1466 valtmp = pA[j]; 1467 for (k=0; nexta<anz; k++) { 1468 if (cj[k] == adj[nexta]) { 1469 ca[k] += valtmp*ada[nexta]; 1470 nexta++; 1471 } 1472 } 1473 ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr); 1474 } 1475 } 1476 1477 /* 3) send and recv matrix values coa */ 1478 /*------------------------------------*/ 1479 buf_ri = merge->buf_ri; 1480 buf_rj = merge->buf_rj; 1481 len_s = merge->len_s; 1482 ierr = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr); 1483 ierr = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr); 1484 1485 ierr = PetscMalloc2(merge->nsend+1,&s_waits,size,&status);CHKERRQ(ierr); 1486 for (proc=0,k=0; proc<size; proc++) { 1487 if (!len_s[proc]) continue; 1488 i = merge->owners_co[proc]; 1489 ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr); 1490 k++; 1491 } 1492 if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);} 1493 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);} 1494 1495 ierr = PetscFree2(s_waits,status);CHKERRQ(ierr); 1496 ierr = PetscFree(r_waits);CHKERRQ(ierr); 1497 ierr = PetscFree(coa);CHKERRQ(ierr); 1498 1499 /* 4) insert local Cseq and received values into Cmpi */ 1500 /*----------------------------------------------------*/ 1501 ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr); 1502 for (k=0; k<merge->nrecv; k++) { 1503 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1504 nrows = *(buf_ri_k[k]); 1505 nextrow[k] = buf_ri_k[k]+1; /* next row number of k-th recved i-structure */ 1506 nextci[k] = buf_ri_k[k] + (nrows + 1); /* poins to the next i-structure of k-th recved i-structure */ 1507 } 1508 1509 for (i=0; i<cm; i++) { 1510 row = owners[rank] + i; /* global row index of C_seq */ 1511 bj_i = bj + bi[i]; /* col indices of the i-th row of C */ 1512 ba_i = ba + bi[i]; 1513 bnz = bi[i+1] - bi[i]; 1514 /* add received vals into ba */ 1515 for (k=0; k<merge->nrecv; k++) { /* k-th received message */ 1516 /* i-th row */ 1517 if (i == *nextrow[k]) { 1518 cnz = *(nextci[k]+1) - *nextci[k]; 1519 cj = buf_rj[k] + *(nextci[k]); 1520 ca = abuf_r[k] + *(nextci[k]); 1521 nextcj = 0; 1522 for (j=0; nextcj<cnz; j++) { 1523 if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */ 1524 ba_i[j] += ca[nextcj++]; 1525 } 1526 } 1527 nextrow[k]++; nextci[k]++; 1528 ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr); 1529 } 1530 } 1531 ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr); 1532 } 1533 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1534 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1535 1536 ierr = PetscFree(ba);CHKERRQ(ierr); 1537 ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr); 1538 ierr = PetscFree(abuf_r);CHKERRQ(ierr); 1539 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1540 PetscFunctionReturn(0); 1541 } 1542 1543 /* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ(); 1544 differ from MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable in using LLCondensedCreate_Scalable() */ 1545 #undef __FUNCT__ 1546 #define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ" 1547 PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C) 1548 { 1549 PetscErrorCode ierr; 1550 Mat Cmpi,A_loc,POt,PDt; 1551 Mat_PtAPMPI *ptap; 1552 PetscFreeSpaceList free_space=NULL,current_space=NULL; 1553 Mat_MPIAIJ *p =(Mat_MPIAIJ*)P->data,*c; 1554 PetscInt *pdti,*pdtj,*poti,*potj,*ptJ; 1555 PetscInt nnz; 1556 PetscInt *lnk,*owners_co,*coi,*coj,i,k,pnz,row; 1557 PetscInt am =A->rmap->n,pn=P->cmap->n; 1558 MPI_Comm comm; 1559 PetscMPIInt size,rank,tagi,tagj,*len_si,*len_s,*len_ri; 1560 PetscInt **buf_rj,**buf_ri,**buf_ri_k; 1561 PetscInt len,proc,*dnz,*onz,*owners; 1562 PetscInt nzi,*bi,*bj; 1563 PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci; 1564 MPI_Request *swaits,*rwaits; 1565 MPI_Status *sstatus,rstatus; 1566 Mat_Merge_SeqsToMPI *merge; 1567 PetscInt *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j; 1568 PetscReal afill =1.0,afill_tmp; 1569 PetscInt rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax; 1570 PetscScalar *vals; 1571 Mat_SeqAIJ *a_loc, *pdt,*pot; 1572 1573 PetscFunctionBegin; 1574 ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1575 /* check if matrix local sizes are compatible */ 1576 if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend) { 1577 SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend); 1578 } 1579 1580 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 1581 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 1582 1583 /* create struct Mat_PtAPMPI and attached it to C later */ 1584 ierr = PetscNew(&ptap);CHKERRQ(ierr); 1585 1586 /* get A_loc by taking all local rows of A */ 1587 ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr); 1588 1589 ptap->A_loc = A_loc; 1590 a_loc = (Mat_SeqAIJ*)(A_loc)->data; 1591 ai = a_loc->i; 1592 aj = a_loc->j; 1593 1594 /* determine symbolic Co=(p->B)^T*A - send to others */ 1595 /*----------------------------------------------------*/ 1596 ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr); 1597 pdt = (Mat_SeqAIJ*)PDt->data; 1598 pdti = pdt->i; pdtj = pdt->j; 1599 1600 ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr); 1601 pot = (Mat_SeqAIJ*)POt->data; 1602 poti = pot->i; potj = pot->j; 1603 1604 /* then, compute symbolic Co = (p->B)^T*A */ 1605 pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors 1606 >= (num of nonzero rows of C_seq) - pn */ 1607 ierr = PetscMalloc1((pon+1),&coi);CHKERRQ(ierr); 1608 coi[0] = 0; 1609 1610 /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */ 1611 nnz = fill*(poti[pon] + ai[am]); 1612 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1613 current_space = free_space; 1614 1615 /* create and initialize a linked list */ 1616 i = PetscMax(pdt->rmax,pot->rmax); 1617 Crmax = i*a_loc->rmax*size; /* non-scalable! */ 1618 if (!Crmax || Crmax > aN) Crmax = aN; 1619 ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr); 1620 1621 for (i=0; i<pon; i++) { 1622 pnz = poti[i+1] - poti[i]; 1623 ptJ = potj + poti[i]; 1624 for (j=0; j<pnz; j++) { 1625 row = ptJ[j]; /* row of A_loc == col of Pot */ 1626 anz = ai[row+1] - ai[row]; 1627 Jptr = aj + ai[row]; 1628 /* add non-zero cols of AP into the sorted linked list lnk */ 1629 ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr); 1630 } 1631 nnz = lnk[0]; 1632 1633 /* If free space is not available, double the total space in the list */ 1634 if (current_space->local_remaining<nnz) { 1635 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1636 nspacedouble++; 1637 } 1638 1639 /* Copy data into free space, and zero out denserows */ 1640 ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr); 1641 1642 current_space->array += nnz; 1643 current_space->local_used += nnz; 1644 current_space->local_remaining -= nnz; 1645 1646 coi[i+1] = coi[i] + nnz; 1647 } 1648 1649 ierr = PetscMalloc1((coi[pon]+1),&coj);CHKERRQ(ierr); 1650 ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr); 1651 1652 afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]+1); 1653 if (afill_tmp > afill) afill = afill_tmp; 1654 1655 /* send j-array (coj) of Co to other processors */ 1656 /*----------------------------------------------*/ 1657 /* determine row ownership */ 1658 ierr = PetscNew(&merge);CHKERRQ(ierr); 1659 ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr); 1660 1661 merge->rowmap->n = pn; 1662 merge->rowmap->bs = 1; 1663 1664 ierr = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr); 1665 owners = merge->rowmap->range; 1666 1667 /* determine the number of messages to send, their lengths */ 1668 ierr = PetscCalloc1(size,&len_si);CHKERRQ(ierr); 1669 ierr = PetscMalloc1(size,&merge->len_s);CHKERRQ(ierr); 1670 1671 len_s = merge->len_s; 1672 merge->nsend = 0; 1673 1674 ierr = PetscMalloc1((size+2),&owners_co);CHKERRQ(ierr); 1675 ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr); 1676 1677 proc = 0; 1678 for (i=0; i<pon; i++) { 1679 while (prmap[i] >= owners[proc+1]) proc++; 1680 len_si[proc]++; /* num of rows in Co to be sent to [proc] */ 1681 len_s[proc] += coi[i+1] - coi[i]; 1682 } 1683 1684 len = 0; /* max length of buf_si[] */ 1685 owners_co[0] = 0; 1686 for (proc=0; proc<size; proc++) { 1687 owners_co[proc+1] = owners_co[proc] + len_si[proc]; 1688 if (len_si[proc]) { 1689 merge->nsend++; 1690 len_si[proc] = 2*(len_si[proc] + 1); 1691 len += len_si[proc]; 1692 } 1693 } 1694 1695 /* determine the number and length of messages to receive for coi and coj */ 1696 ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&merge->nrecv);CHKERRQ(ierr); 1697 ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr); 1698 1699 /* post the Irecv and Isend of coj */ 1700 ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr); 1701 ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr); 1702 ierr = PetscMalloc1((merge->nsend+1),&swaits);CHKERRQ(ierr); 1703 for (proc=0, k=0; proc<size; proc++) { 1704 if (!len_s[proc]) continue; 1705 i = owners_co[proc]; 1706 ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr); 1707 k++; 1708 } 1709 1710 /* receives and sends of coj are complete */ 1711 ierr = PetscMalloc1(size,&sstatus);CHKERRQ(ierr); 1712 for (i=0; i<merge->nrecv; i++) { 1713 PetscMPIInt icompleted; 1714 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1715 } 1716 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1717 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1718 1719 /* send and recv coi */ 1720 /*-------------------*/ 1721 ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr); 1722 ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr); 1723 ierr = PetscMalloc1((len+1),&buf_s);CHKERRQ(ierr); 1724 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1725 for (proc=0,k=0; proc<size; proc++) { 1726 if (!len_s[proc]) continue; 1727 /* form outgoing message for i-structure: 1728 buf_si[0]: nrows to be sent 1729 [1:nrows]: row index (global) 1730 [nrows+1:2*nrows+1]: i-structure index 1731 */ 1732 /*-------------------------------------------*/ 1733 nrows = len_si[proc]/2 - 1; 1734 buf_si_i = buf_si + nrows+1; 1735 buf_si[0] = nrows; 1736 buf_si_i[0] = 0; 1737 nrows = 0; 1738 for (i=owners_co[proc]; i<owners_co[proc+1]; i++) { 1739 nzi = coi[i+1] - coi[i]; 1740 buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */ 1741 buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */ 1742 nrows++; 1743 } 1744 ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr); 1745 k++; 1746 buf_si += len_si[proc]; 1747 } 1748 i = merge->nrecv; 1749 while (i--) { 1750 PetscMPIInt icompleted; 1751 ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr); 1752 } 1753 ierr = PetscFree(rwaits);CHKERRQ(ierr); 1754 if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);} 1755 ierr = PetscFree(len_si);CHKERRQ(ierr); 1756 ierr = PetscFree(len_ri);CHKERRQ(ierr); 1757 ierr = PetscFree(swaits);CHKERRQ(ierr); 1758 ierr = PetscFree(sstatus);CHKERRQ(ierr); 1759 ierr = PetscFree(buf_s);CHKERRQ(ierr); 1760 1761 /* compute the local portion of C (mpi mat) */ 1762 /*------------------------------------------*/ 1763 /* allocate bi array and free space for accumulating nonzero column info */ 1764 ierr = PetscMalloc1((pn+1),&bi);CHKERRQ(ierr); 1765 bi[0] = 0; 1766 1767 /* set initial free space to be fill*(nnz(P) + nnz(AP)) */ 1768 nnz = fill*(pdti[pn] + poti[pon] + ai[am]); 1769 ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr); 1770 current_space = free_space; 1771 1772 ierr = PetscMalloc3(merge->nrecv,&buf_ri_k,merge->nrecv,&nextrow,merge->nrecv,&nextci);CHKERRQ(ierr); 1773 for (k=0; k<merge->nrecv; k++) { 1774 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1775 nrows = *buf_ri_k[k]; 1776 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1777 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recieved i-structure */ 1778 } 1779 1780 ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr); 1781 rmax = 0; 1782 for (i=0; i<pn; i++) { 1783 /* add pdt[i,:]*AP into lnk */ 1784 pnz = pdti[i+1] - pdti[i]; 1785 ptJ = pdtj + pdti[i]; 1786 for (j=0; j<pnz; j++) { 1787 row = ptJ[j]; /* row of AP == col of Pt */ 1788 anz = ai[row+1] - ai[row]; 1789 Jptr = aj + ai[row]; 1790 /* add non-zero cols of AP into the sorted linked list lnk */ 1791 ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr); 1792 } 1793 1794 /* add received col data into lnk */ 1795 for (k=0; k<merge->nrecv; k++) { /* k-th received message */ 1796 if (i == *nextrow[k]) { /* i-th row */ 1797 nzi = *(nextci[k]+1) - *nextci[k]; 1798 Jptr = buf_rj[k] + *nextci[k]; 1799 ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr); 1800 nextrow[k]++; nextci[k]++; 1801 } 1802 } 1803 nnz = lnk[0]; 1804 1805 /* if free space is not available, make more free space */ 1806 if (current_space->local_remaining<nnz) { 1807 ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,¤t_space);CHKERRQ(ierr); 1808 nspacedouble++; 1809 } 1810 /* copy data into free space, then initialize lnk */ 1811 ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr); 1812 ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr); 1813 1814 current_space->array += nnz; 1815 current_space->local_used += nnz; 1816 current_space->local_remaining -= nnz; 1817 1818 bi[i+1] = bi[i] + nnz; 1819 if (nnz > rmax) rmax = nnz; 1820 } 1821 ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr); 1822 1823 ierr = PetscMalloc1((bi[pn]+1),&bj);CHKERRQ(ierr); 1824 ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr); 1825 afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]+1); 1826 if (afill_tmp > afill) afill = afill_tmp; 1827 ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr); 1828 ierr = MatDestroy(&POt);CHKERRQ(ierr); 1829 ierr = MatDestroy(&PDt);CHKERRQ(ierr); 1830 1831 /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */ 1832 /*----------------------------------------------------------------------------------*/ 1833 ierr = PetscCalloc1(rmax+1,&vals);CHKERRQ(ierr); 1834 1835 ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr); 1836 ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr); 1837 ierr = MatSetBlockSizes(Cmpi,PetscAbs(P->cmap->bs),PetscAbs(A->cmap->bs));CHKERRQ(ierr); 1838 ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr); 1839 ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr); 1840 ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr); 1841 ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr); 1842 for (i=0; i<pn; i++) { 1843 row = i + rstart; 1844 nnz = bi[i+1] - bi[i]; 1845 Jptr = bj + bi[i]; 1846 ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr); 1847 } 1848 ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1849 ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1850 ierr = PetscFree(vals);CHKERRQ(ierr); 1851 1852 merge->bi = bi; 1853 merge->bj = bj; 1854 merge->coi = coi; 1855 merge->coj = coj; 1856 merge->buf_ri = buf_ri; 1857 merge->buf_rj = buf_rj; 1858 merge->owners_co = owners_co; 1859 merge->destroy = Cmpi->ops->destroy; 1860 merge->duplicate = Cmpi->ops->duplicate; 1861 1862 Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ; 1863 Cmpi->ops->destroy = MatDestroy_MPIAIJ_PtAP; 1864 1865 /* attach the supporting struct to Cmpi for reuse */ 1866 c = (Mat_MPIAIJ*)Cmpi->data; 1867 1868 c->ptap = ptap; 1869 ptap->api = NULL; 1870 ptap->apj = NULL; 1871 ptap->merge = merge; 1872 ptap->rmax = rmax; 1873 ptap->apa = NULL; 1874 1875 *C = Cmpi; 1876 #if defined(PETSC_USE_INFO) 1877 if (bi[pn] != 0) { 1878 ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)afill);CHKERRQ(ierr); 1879 ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n",(double)afill);CHKERRQ(ierr); 1880 } else { 1881 ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr); 1882 } 1883 #endif 1884 PetscFunctionReturn(0); 1885 } 1886