/*
  Defines matrix-matrix product routines for pairs of MPIAIJ matrices
          C = A * B
*/
#include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
#include <../src/mat/utils/freespace.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>
#include <petscbt.h>
#include <../src/mat/impls/dense/mpi/mpidense.h>

#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMult_MPIAIJ_MPIAIJ(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscLogEventBegin(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
    ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ(A,B,fill,C);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_MatMultSymbolic,A,B,0,0);CHKERRQ(ierr);
  }
  ierr = PetscLogEventBegin(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  ierr = (*(*C)->ops->matmultnumeric)(A,B,*C);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_MatMultNumeric,A,B,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
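/*
  Usage sketch (added commentary, not part of the original source): the routines in this file
  are normally reached through the public MatMatMult() interface, which dispatches to
  MatMatMult_MPIAIJ_MPIAIJ() when both operands are MPIAIJ. Assuming A and B are assembled
  MPIAIJ matrices with compatible layouts, a caller would typically do

     Mat C;
     ierr = MatMatMult(A,B,MAT_INITIAL_MATRIX,2.0,&C);CHKERRQ(ierr);   -- symbolic + numeric
     ...update values of A and/or B without changing their nonzero patterns...
     ierr = MatMatMult(A,B,MAT_REUSE_MATRIX,2.0,&C);CHKERRQ(ierr);     -- numeric phase only
     ierr = MatDestroy(&C);CHKERRQ(ierr);

  where the fill argument (here 2.0) is the caller's estimate of nnz(C)/(nnz(A)+nnz(B)); the
  actual ratio needed is reported via PetscInfo at the end of the symbolic phase.
*/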
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJ_MatMatMult"
PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(Mat A)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = PetscFree2(ptap->startsj_s,ptap->startsj_r);CHKERRQ(ierr);
  ierr = PetscFree(ptap->bufa);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_loc);CHKERRQ(ierr);
  ierr = MatDestroy(&ptap->P_oth);CHKERRQ(ierr);
  ierr = PetscFree(ptap->api);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apj);CHKERRQ(ierr);
  ierr = PetscFree(ptap->apa);CHKERRQ(ierr);
  ierr = ptap->destroy(A);CHKERRQ(ierr);
  ierr = PetscFree(ptap);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDuplicate_MPIAIJ_MatMatMult"
PetscErrorCode MatDuplicate_MPIAIJ_MatMatMult(Mat A, MatDuplicateOption op, Mat *M)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  Mat_PtAPMPI    *ptap = a->ptap;

  PetscFunctionBegin;
  ierr = (*ptap->duplicate)(A,op,M);CHKERRQ(ierr);
  (*M)->ops->destroy   = ptap->destroy;   /* = MatDestroy_MPIAIJ, *M doesn't duplicate A's special structure! */
  (*M)->ops->duplicate = ptap->duplicate; /* = MatDuplicate_MPIAIJ */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,*apa,valtmp,*ca;
  PetscInt       cm=C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  /* get apa for storing dense row A[i,:]*P */
  apa = ptap->apa;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];

      /* perform dense axpy */
      valtmp = ada[j];
      for (k=0; k<pnz; k++){
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];

      /* perform dense axpy */
      valtmp = aoa[j];
      for (k=0; k<pnz; k++){
        apa[pj[k]] += valtmp*pa[k];
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    apJ  = apj + api[i];
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++){
      if (apJ[k] >= cstart) break;
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++){
      ca[k1]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++){
      ca[k0]      = apa[apJ[k]];
      apa[apJ[k]] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm=((PetscObject)A)->comm;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           *lnk,i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscBT            lnkbt;
  PetscScalar        *apa;
  PetscReal          afill;
  PetscBool          scalable=PETSC_FALSE;
  PetscInt           nlnk_max,armax,prmax;

  PetscFunctionBegin;
  if (A->cmap->rstart != P->rmap->rstart || A->cmap->rend != P->rmap->rend){
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, (%D, %D) != (%D,%D)",A->cmap->rstart,A->cmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-matmatmult_scalable","Use a scalable but slower C=A*B","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
  if (scalable){
    ierr = MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(A,P,fill,C);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax    = ad->rmax+ao->rmax;
  prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate(nlnk_max,pN,&lnk,&lnkbt);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    apnz = 0;
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++){
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++){
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    apnz     = lnk[0];
    api[i+1] = api[i] + apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr);

  /* malloc apa to store dense row A[i,:]*P */
  ierr = PetscMalloc(pN*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,pN*sizeof(PetscScalar));CHKERRQ(ierr);
  ptap->apa = apa;

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++){
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
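/*
  Note (added commentary, not part of the original source): the symbolic/numeric pair above
  uses a dense accumulator. The work array apa has length P->cmap->N; each local row of C is
  formed as AP(i,:) = sum_k A(i,k)*P(k,:) by dense axpys into apa, and the touched entries are
  then copied into C's diagonal and off-diagonal blocks and reset to zero for the next row.
  Because apa grows with the global number of columns of P, the option checked above,
  -matmatmult_scalable, switches instead to the *_Scalable variants later in this file, which
  keep only one accumulator slot per nonzero of the current row of AP. For example
  (hypothetical executable name):

      mpiexec -n 4 ./app -matmatmult_scalable
*/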
#undef __FUNCT__
#define __FUNCT__ "MatMatMult_MPIAIJ_MPIDense"
PetscErrorCode MatMatMult_MPIAIJ_MPIDense(Mat A,Mat B,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = MatMatMultSymbolic_MPIAIJ_MPIDense(A,B,fill,C);CHKERRQ(ierr);
  }
  ierr = MatMatMultNumeric_MPIAIJ_MPIDense(A,B,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

typedef struct {
  Mat         workB;
  PetscScalar *rvalues,*svalues;
  MPI_Request *rwaits,*swaits;
} MPIAIJ_MPIDense;

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJ_MPIDenseDestroy"
PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx)
{
  MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense*) ctx;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatDestroy(&contents->workB);CHKERRQ(ierr);
  ierr = PetscFree4(contents->rvalues,contents->svalues,contents->rwaits,contents->swaits);CHKERRQ(ierr);
  ierr = PetscFree(contents);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A,Mat B,PetscReal fill,Mat *C)
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*) A->data;
  PetscInt               nz = aij->B->cmap->n;
  PetscContainer         container;
  MPIAIJ_MPIDense        *contents;
  VecScatter             ctx = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               m=A->rmap->n,n=B->cmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(((PetscObject)B)->comm,C);CHKERRQ(ierr);
  ierr = MatSetSizes(*C,m,n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetType(*C,MATMPIDENSE);CHKERRQ(ierr);
  ierr = MatMPIDenseSetPreallocation(*C,PETSC_NULL);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  (*C)->ops->matmult = MatMatMult_MPIAIJ_MPIDense;

  ierr = PetscNew(MPIAIJ_MPIDense,&contents);CHKERRQ(ierr);
  /* Create work matrix used to store off processor rows of B needed for local product */
  ierr = MatCreateSeqDense(PETSC_COMM_SELF,nz,B->cmap->N,PETSC_NULL,&contents->workB);CHKERRQ(ierr);
  /* Create work arrays needed */
  ierr = PetscMalloc4(B->cmap->N*from->starts[from->n],PetscScalar,&contents->rvalues,
                      B->cmap->N*to->starts[to->n],PetscScalar,&contents->svalues,
                      from->n,MPI_Request,&contents->rwaits,
                      to->n,MPI_Request,&contents->swaits);CHKERRQ(ierr);

  ierr = PetscContainerCreate(((PetscObject)A)->comm,&container);CHKERRQ(ierr);
  ierr = PetscContainerSetPointer(container,contents);CHKERRQ(ierr);
  ierr = PetscContainerSetUserDestroy(container,MatMPIAIJ_MPIDenseDestroy);CHKERRQ(ierr);
  ierr = PetscObjectCompose((PetscObject)(*C),"workB",(PetscObject)container);CHKERRQ(ierr);
  ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
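/*
  Note (added commentary, not part of the original source): the MPIAIJ_MPIDense context built
  above is kept alive by attaching it to C through a PetscContainer under the key "workB", so
  repeated numeric products with MAT_REUSE_MATRIX can reuse the same send/receive buffers and
  the same sequential dense work matrix workB (one row per off-process row of B referenced by
  the local off-diagonal block of A). MatMPIDenseScatter() below fills workB; the dense arrays
  are stored column by column, which is why the pack/unpack loops index b and w with a stride
  equal to the local number of rows.
*/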
#undef __FUNCT__
#define __FUNCT__ "MatMPIDenseScatter"
/*
    Performs an efficient scatter on the rows of B needed by this process; this is
    a modification of the VecScatterBegin_() routines.
*/
PetscErrorCode MatMPIDenseScatter(Mat A,Mat B,Mat C,Mat *outworkB)
{
  Mat_MPIAIJ             *aij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode         ierr;
  PetscScalar            *b,*w,*svalues,*rvalues;
  VecScatter             ctx = aij->Mvctx;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*) ctx->fromdata;
  VecScatter_MPI_General *to = (VecScatter_MPI_General*) ctx->todata;
  PetscInt               i,j,k;
  PetscInt               *sindices,*sstarts,*rindices,*rstarts;
  PetscMPIInt            *sprocs,*rprocs,nrecvs;
  MPI_Request            *swaits,*rwaits;
  MPI_Comm               comm = ((PetscObject)A)->comm;
  PetscMPIInt            tag = ((PetscObject)ctx)->tag,ncols = B->cmap->N,nrows = aij->B->cmap->n,imdex,nrowsB = B->rmap->n;
  MPI_Status             status;
  MPIAIJ_MPIDense        *contents;
  PetscContainer         container;
  Mat                    workB;

  PetscFunctionBegin;
  ierr = PetscObjectQuery((PetscObject)C,"workB",(PetscObject*)&container);CHKERRQ(ierr);
  if (!container) SETERRQ(comm,PETSC_ERR_PLIB,"Container does not exist");
  ierr = PetscContainerGetPointer(container,(void**)&contents);CHKERRQ(ierr);

  workB = *outworkB = contents->workB;
  if (nrows != workB->rmap->n) SETERRQ2(comm,PETSC_ERR_PLIB,"Number of rows of workB %D not equal to columns of aij->B %D",workB->rmap->n,nrows);
  sindices = to->indices;
  sstarts  = to->starts;
  sprocs   = to->procs;
  swaits   = contents->swaits;
  svalues  = contents->svalues;

  rindices = from->indices;
  rstarts  = from->starts;
  rprocs   = from->procs;
  rwaits   = contents->rwaits;
  rvalues  = contents->rvalues;

  ierr = MatGetArray(B,&b);CHKERRQ(ierr);
  ierr = MatGetArray(workB,&w);CHKERRQ(ierr);

  for (i=0; i<from->n; i++) {
    ierr = MPI_Irecv(rvalues+ncols*rstarts[i],ncols*(rstarts[i+1]-rstarts[i]),MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
  }

  for (i=0; i<to->n; i++) {
    /* pack a message at a time */
    CHKMEMQ;
    for (j=0; j<sstarts[i+1]-sstarts[i]; j++){
      for (k=0; k<ncols; k++) {
        svalues[ncols*(sstarts[i] + j) + k] = b[sindices[sstarts[i]+j] + nrowsB*k];
      }
    }
    CHKMEMQ;
    ierr = MPI_Isend(svalues+ncols*sstarts[i],ncols*(sstarts[i+1]-sstarts[i]),MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
  }

  nrecvs = from->n;
  while (nrecvs) {
    ierr = MPI_Waitany(from->n,rwaits,&imdex,&status);CHKERRQ(ierr);
    nrecvs--;
    /* unpack a message at a time */
    CHKMEMQ;
    for (j=0; j<rstarts[imdex+1]-rstarts[imdex]; j++){
      for (k=0; k<ncols; k++) {
        w[rindices[rstarts[imdex]+j] + nrows*k] = rvalues[ncols*(rstarts[imdex] + j) + k];
      }
    }
    CHKMEMQ;
  }
  if (to->n) {ierr = MPI_Waitall(to->n,swaits,to->sstatus);CHKERRQ(ierr);}

  ierr = MatRestoreArray(B,&b);CHKERRQ(ierr);
  ierr = MatRestoreArray(workB,&w);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(workB,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

extern PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat,Mat,Mat);

#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIDense"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A,Mat B,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIDense   *bdense = (Mat_MPIDense*)B->data;
  Mat_MPIDense   *cdense = (Mat_MPIDense*)C->data;
  Mat            workB;

  PetscFunctionBegin;

  /* diagonal block of A times all local rows of B */
  ierr = MatMatMultNumeric_SeqAIJ_SeqDense(aij->A,bdense->A,cdense->A);CHKERRQ(ierr);

  /* get off processor parts of B needed to complete the product */
  ierr = MatMPIDenseScatter(A,B,C,&workB);CHKERRQ(ierr);

  /* off-diagonal block of A times nonlocal rows of B */
  ierr = MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B,workB,cdense->A);CHKERRQ(ierr);
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
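/*
  Note (added commentary, not part of the original source): the numeric MPIDense product above
  splits the computation as

      C_local = A_d * B_local + A_o * workB,

  where A_d and A_o are the diagonal and off-diagonal sequential blocks of the MPIAIJ matrix A,
  B_local holds the locally owned rows of B, and workB holds the off-process rows of B gathered
  by MatMPIDenseScatter(). Both terms are computed with the sequential AIJ-times-dense kernels.
*/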
#undef __FUNCT__
#define __FUNCT__ "MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,Mat C)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ     *ad = (Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
  Mat_SeqAIJ     *cd = (Mat_SeqAIJ*)(c->A)->data,*co=(Mat_SeqAIJ*)(c->B)->data;
  PetscInt       *adi=ad->i,*adj,*aoi=ao->i,*aoj;
  PetscScalar    *ada,*aoa,*cda=cd->a,*coa=co->a;
  Mat_SeqAIJ     *p_loc,*p_oth;
  PetscInt       *pi_loc,*pj_loc,*pi_oth,*pj_oth,*pj;
  PetscScalar    *pa_loc,*pa_oth,*pa,valtmp,*ca;
  PetscInt       cm=C->rmap->n,anz,pnz;
  Mat_PtAPMPI    *ptap=c->ptap;
  PetscScalar    *apa_sparse=ptap->apa;
  PetscInt       *api,*apj,*apJ,i,j,k,row;
  PetscInt       cstart=C->cmap->rstart;
  PetscInt       cdnz,conz,k0,k1,nextp;

  PetscFunctionBegin;
  /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
  /*-----------------------------------------------------*/
  /* update numerical values of P_oth and P_loc */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);
  ierr = MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */
  /*----------------------------------------------------------*/
  /* get data from symbolic products */
  p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc=p_loc->i; pj_loc=p_loc->j; pa_loc=p_loc->a;
  pi_oth=p_oth->i; pj_oth=p_oth->j; pa_oth=p_oth->a;

  api = ptap->api;
  apj = ptap->apj;
  for (i=0; i<cm; i++) {
    apJ = apj + api[i];

    /* diagonal portion of A */
    anz = adi[i+1] - adi[i];
    adj = ad->j + adi[i];
    ada = ad->a + adi[i];
    for (j=0; j<anz; j++) {
      row = adj[j];
      pnz = pi_loc[row+1] - pi_loc[row];
      pj  = pj_loc + pi_loc[row];
      pa  = pa_loc + pi_loc[row];
      /* perform sparse axpy */
      valtmp = ada[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* off-diagonal portion of A */
    anz = aoi[i+1] - aoi[i];
    aoj = ao->j + aoi[i];
    aoa = ao->a + aoi[i];
    for (j=0; j<anz; j++) {
      row = aoj[j];
      pnz = pi_oth[row+1] - pi_oth[row];
      pj  = pj_oth + pi_oth[row];
      pa  = pa_oth + pi_oth[row];
      /* perform sparse axpy */
      valtmp = aoa[j];
      nextp  = 0;
      for (k=0; nextp<pnz; k++) {
        if (apJ[k] == pj[nextp]) { /* column of AP == column of P */
          apa_sparse[k] += valtmp*pa[nextp++];
        }
      }
      ierr = PetscLogFlops(2.0*pnz);CHKERRQ(ierr);
    }

    /* set values in C */
    cdnz = cd->i[i+1] - cd->i[i];
    conz = co->i[i+1] - co->i[i];

    /* 1st off-diagonal part of C */
    ca = coa + co->i[i];
    k  = 0;
    for (k0=0; k0<conz; k0++){
      if (apJ[k] >= cstart) break;
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* diagonal part of C */
    ca = cda + cd->i[i];
    for (k1=0; k1<cdnz; k1++){
      ca[k1]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }

    /* 2nd off-diagonal part of C */
    ca = coa + co->i[i];
    for (; k0<conz; k0++){
      ca[k0]        = apa_sparse[k];
      apa_sparse[k] = 0.0;
      k++;
    }
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
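/*
  Note (added commentary, not part of the original source): the _Scalable numeric kernel above
  differs from MatMatMultNumeric_MPIAIJ_MPIAIJ() only in the accumulator. Instead of a dense
  array of length P->cmap->N indexed by global column, it uses apa_sparse, which has one slot
  per nonzero of the current row of AP (positions given by the symbolic column list apJ), so
  the extra storage is bounded by the longest row of AP rather than by the global column count.
*/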
/* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ(), except using LLCondensed to avoid O(BN) memory requirement */
#undef __FUNCT__
#define __FUNCT__ "MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat A,Mat P,PetscReal fill,Mat *C)
{
  PetscErrorCode     ierr;
  MPI_Comm           comm=((PetscObject)A)->comm;
  Mat                Cmpi;
  Mat_PtAPMPI        *ptap;
  PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ         *a=(Mat_MPIAIJ*)A->data,*c;
  Mat_SeqAIJ         *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_loc,*p_oth;
  PetscInt           *pi_loc,*pj_loc,*pi_oth,*pj_oth,*dnz,*onz;
  PetscInt           *adi=ad->i,*adj=ad->j,*aoi=ao->i,*aoj=ao->j,rstart=A->rmap->rstart;
  PetscInt           i,pnz,row,*api,*apj,*Jptr,apnz,nspacedouble=0,j,nzi,*lnk,apnz_max=0;
  PetscInt           am=A->rmap->n,pN=P->cmap->N,pn=P->cmap->n,pm=P->rmap->n;
  PetscInt           nlnk_max,armax,prmax;
  PetscReal          afill;
  PetscScalar        *apa;

  PetscFunctionBegin;
  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);CHKERRQ(ierr);

  /* get P_loc by taking all local rows of P */
  ierr = MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);CHKERRQ(ierr);

  p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
  p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;
  pi_loc = p_loc->i; pj_loc = p_loc->j;
  pi_oth = p_oth->i; pj_oth = p_oth->j;

  /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */
  /*-------------------------------------------------------------------*/
  ierr      = PetscMalloc((am+2)*sizeof(PetscInt),&api);CHKERRQ(ierr);
  ptap->api = api;
  api[0]    = 0;

  /* create and initialize a linked list */
  armax    = ad->rmax+ao->rmax;
  prmax    = PetscMax(p_loc->rmax,p_oth->rmax);
  nlnk_max = armax*prmax;
  if (!nlnk_max || nlnk_max > pN) nlnk_max = pN;
  ierr = PetscLLCondensedCreate_Scalable(nlnk_max,&lnk);CHKERRQ(ierr);

  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  ierr = PetscFreeSpaceGet((PetscInt)(fill*(adi[am]+aoi[am]+pi_loc[pm])),&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = MatPreallocateInitialize(comm,am,pn,dnz,onz);CHKERRQ(ierr);
  for (i=0; i<am; i++) {
    apnz = 0;
    /* diagonal portion of A */
    nzi = adi[i+1] - adi[i];
    for (j=0; j<nzi; j++){
      row  = *adj++;
      pnz  = pi_loc[row+1] - pi_loc[row];
      Jptr = pj_loc + pi_loc[row];
      /* add non-zero cols of P into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }
    /* off-diagonal portion of A */
    nzi = aoi[i+1] - aoi[i];
    for (j=0; j<nzi; j++){
      row  = *aoj++;
      pnz  = pi_oth[row+1] - pi_oth[row];
      Jptr = pj_oth + pi_oth[row];
      ierr = PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);CHKERRQ(ierr);
    }

    apnz     = *lnk;
    api[i+1] = api[i] + apnz;
    if (apnz > apnz_max) apnz_max = apnz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining<apnz) {
      ierr = PetscFreeSpaceGet(apnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+rstart,apnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += apnz;
    current_space->local_used      += apnz;
    current_space->local_remaining -= apnz;
  }

  /* Allocate space for apj, initialize apj, and */
  /* destroy list of free space and other temporary array(s) */
  ierr = PetscMalloc((api[am]+1)*sizeof(PetscInt),&ptap->apj);CHKERRQ(ierr);
  apj  = ptap->apj;
  ierr = PetscFreeSpaceContiguous(&free_space,ptap->apj);CHKERRQ(ierr);
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);

  /* create and assemble symbolic parallel matrix Cmpi */
  /*----------------------------------------------------*/
  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,am,pn,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);

  /* malloc apa for assembly Cmpi */
  ierr = PetscMalloc(apnz_max*sizeof(PetscScalar),&apa);CHKERRQ(ierr);
  ierr = PetscMemzero(apa,apnz_max*sizeof(PetscScalar));CHKERRQ(ierr);
  ptap->apa = apa;
  for (i=0; i<am; i++){
    row  = i + rstart;
    apnz = api[i+1] - api[i];
    ierr = MatSetValues(Cmpi,1,&row,apnz,apj,apa,INSERT_VALUES);CHKERRQ(ierr);
    apj += apnz;
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ptap->destroy             = Cmpi->ops->destroy;
  ptap->duplicate           = Cmpi->ops->duplicate;
  Cmpi->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
  Cmpi->ops->destroy        = MatDestroy_MPIAIJ_MatMatMult;
  Cmpi->ops->duplicate      = MatDuplicate_MPIAIJ_MatMatMult;

  /* attach the supporting struct to Cmpi for reuse */
  c       = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap = ptap;

  *C = Cmpi;

  /* set MatInfo */
  afill = (PetscReal)api[am]/(adi[am]+aoi[am]+pi_loc[pm]) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  Cmpi->info.mallocs           = nspacedouble;
  Cmpi->info.fill_ratio_given  = fill;
  Cmpi->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMult_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMult_MPIAIJ_MPIAIJ(Mat P,Mat A,MatReuse scall,PetscReal fill,Mat *C)
{
  PetscErrorCode ierr;
  PetscBool      scalable=PETSC_FALSE;
  PetscFunctionBegin;
  if (scall == MAT_INITIAL_MATRIX){
    ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-mattransposematmult_scalable","Use a scalable but slower C=Pt*A","",scalable,&scalable,PETSC_NULL);CHKERRQ(ierr);
    if (scalable){
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,fill,C);CHKERRQ(ierr);
    } else {
      ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(P,A,fill,C);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnd();CHKERRQ(ierr);
  }
  ierr = (*(*C)->ops->mattransposemultnumeric)(P,A,*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
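/*
  Usage sketch (added commentary, not part of the original source): the driver above backs the
  public MatTransposeMatMult() interface for two MPIAIJ matrices, computing C = P^T * A.
  A typical call, assuming P and A are assembled MPIAIJ matrices whose row layouts match, is

     Mat C;
     ierr = MatTransposeMatMult(P,A,MAT_INITIAL_MATRIX,1.0,&C);CHKERRQ(ierr);
     ierr = MatTransposeMatMult(P,A,MAT_REUSE_MATRIX,1.0,&C);CHKERRQ(ierr);
     ierr = MatDestroy(&C);CHKERRQ(ierr);

  and the runtime option -mattransposematmult_scalable selects the *_Scalable symbolic/numeric
  pair defined later in this file.
*/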
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj,*aJ;
  PetscInt            i,j,k,anz,pnz,row,*cj;
  MatScalar           *ada,*aval,*ca,valtmp;
  PetscInt            am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm=((PetscObject)C)->comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ=po->j,*pdJ=pd->j;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc - dominating part */
  /*--------------------------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);

  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  ierr = PetscMalloc((A->cmap->N)*sizeof(PetscScalar),&aval);CHKERRQ(ierr); /* non-scalable!!! */
  ierr = PetscMemzero(aval,A->cmap->N*sizeof(PetscScalar));CHKERRQ(ierr);

  for (i=0; i<am; i++) {
    /* 2-a) put A[i,:] to dense array aval */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];
    for (j=0; j<anz; j++){
      aval[adj[j]] = ada[j];
    }

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*--------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++){
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++){
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform dense axpy */
      valtmp = pA[j];
      for (k=0; k<cnz; k++) {
        ca[k] += valtmp*aval[cj[k]];
      }
      ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
    }

    /* zero the current row of Pt*A */
    aJ = aj + ai[i];
    for (k=0; k<anz; k++) aval[aJ[k]] = 0.0;
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;             /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];       /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++){
          if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  ierr = PetscFree(aval);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
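/*
  Note (added commentary, not part of the original source): the numeric kernel above forms
  C = P^T * A row by row of A using outer products: for each local row i it scatters A[i,:]
  into the dense work array aval and then, for every nonzero P(i,jp), adds P(i,jp)*A[i,:]
  into row jp of either the locally owned part (ba) or the off-process part (coa) of C.
  The coa rows are then shipped to their owning processes and merged into ba before the final
  MatSetValues()/MatAssembly pass.
*/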
/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pn=P->cmap->n;
  PetscBT             lnkbt;
  MPI_Comm            comm=((PetscObject)A)->comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill=1.0,afill_tmp;
  PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc,*pdt,*pot;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
  ptap->A_loc = A_loc;
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                            >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz  = fill*(poti[pon] + ai[am]);
  ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  i     = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size;
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate(Crmax,aN,&lnk,&lnkbt);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out denserows */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr  = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr  = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr  = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++){
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++; /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++){
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i    = owners_co[proc];
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
      nzi = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr  = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(A)) */
  nnz  = fill*(pdti[pn] + poti[pon] + ai[am]);
  ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++){
      row = ptJ[j]; /* row of AP == col of Pt */
      anz = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted(anz,Jptr,lnk,lnkbt);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean(aN,nnz,current_space->array,lnk,lnkbt);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy(lnk,lnkbt);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi - why cannot be assembled in Numeric part */
  /*----------------------------------------------------------------------------------*/
  ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
  ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++){
    row  = i + rstart;
    nnz  = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi        = bi;
  merge->bj        = bj;
  merge->coi       = coi;
  merge->coj       = coj;
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c           = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap     = ptap;
  ptap->api   = PETSC_NULL;
  ptap->apj   = PETSC_NULL;
  ptap->merge = merge;
  ptap->rmax  = rmax;

  *C = Cmpi;
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
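/*
  Note (added commentary, not part of the original source): the symbolic routine above does more
  than compute the nonzero pattern of C = P^T*A. It also builds the Mat_Merge_SeqsToMPI context
  (row map, the coi/coj pattern of the off-process block Co, the received i-structures
  buf_ri/buf_rj, and the send lengths), attaches it to C through c->ptap->merge, and replaces
  C's destroy operation with MatDestroy_MPIAIJ_PtAP so the context is freed together with C.
  The numeric routine reuses all of this and only exchanges matrix values on repeated products.
*/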
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,Mat C)
{
  PetscErrorCode      ierr;
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
  Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)(p->A)->data,*po=(Mat_SeqAIJ*)(p->B)->data;
  Mat_PtAPMPI         *ptap;
  PetscInt            *adj;
  PetscInt            i,j,k,anz,pnz,row,*cj,nexta;
  MatScalar           *ada,*ca,valtmp;
  PetscInt            am=A->rmap->n,cm=C->rmap->n,pon=(p->B)->cmap->n;
  MPI_Comm            comm=((PetscObject)C)->comm;
  PetscMPIInt         size,rank,taga,*len_s;
  PetscInt            *owners,proc,nrows,**buf_ri_k,**nextrow,**nextci;
  PetscInt            **buf_ri,**buf_rj;
  PetscInt            cnz=0,*bj_i,*bi,*bj,bnz,nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits,*r_waits;
  MPI_Status          *status;
  MatScalar           **abuf_r,*ba_i,*pA,*coa,*ba;
  PetscInt            *ai,*aj,*coi,*coj;
  PetscInt            *poJ=po->j,*pdJ=pd->j;
  Mat                 A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  ptap  = c->ptap;
  merge = ptap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc */
  /*------------------------------------------*/
  /* get data from symbolic products */
  coi  = merge->coi; coj = merge->coj;
  ierr = PetscMalloc((coi[pon]+1)*sizeof(MatScalar),&coa);CHKERRQ(ierr);
  ierr = PetscMemzero(coa,coi[pon]*sizeof(MatScalar));CHKERRQ(ierr);
  bi     = merge->bi; bj = merge->bj;
  owners = merge->rowmap->range;
  ierr   = PetscMalloc((bi[cm]+1)*sizeof(MatScalar),&ba);CHKERRQ(ierr);
  ierr   = PetscMemzero(ba,bi[cm]*sizeof(MatScalar));CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  A_loc = ptap->A_loc;
  ierr  = MatMPIAIJGetLocalMat(A,MAT_REUSE_MATRIX,&A_loc);CHKERRQ(ierr);
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  for (i=0; i<am; i++) {
    /* 2-a) get the i-th sparse row A[i,:] of A_loc */
    anz = ai[i+1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*-------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i+1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j=0; j<pnz; j++){
      row = poJ[j];
      cnz = coi[row+1] - coi[row];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]){
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i+1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j=0; j<pnz; j++){
      row = pdJ[j];
      cnz = bi[row+1] - bi[row];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k=0; nexta<anz; k++) {
        if (cj[k] == adj[nexta]){
          ca[k] += valtmp*ada[nexta];
          nexta++;
        }
      }
      ierr = PetscLogFlops(2.0*anz);CHKERRQ(ierr);
    }

  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  ierr   = PetscCommGetNewTag(comm,&taga);CHKERRQ(ierr);
  ierr   = PetscPostIrecvScalar(comm,taga,merge->nrecv,merge->id_r,merge->len_r,&abuf_r,&r_waits);CHKERRQ(ierr);

  ierr = PetscMalloc2(merge->nsend+1,MPI_Request,&s_waits,size,MPI_Status,&status);CHKERRQ(ierr);
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i    = merge->owners_co[proc];
    ierr = MPI_Isend(coa+coi[i],len_s[proc],MPIU_MATSCALAR,proc,taga,comm,s_waits+k);CHKERRQ(ierr);
    k++;
  }
  if (merge->nrecv) {ierr = MPI_Waitall(merge->nrecv,r_waits,status);CHKERRQ(ierr);}
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,s_waits,status);CHKERRQ(ierr);}

  ierr = PetscFree2(s_waits,status);CHKERRQ(ierr);
  ierr = PetscFree(r_waits);CHKERRQ(ierr);
  ierr = PetscFree(coa);CHKERRQ(ierr);

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k]+1;             /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  for (i=0; i<cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];       /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i+1] - bi[i];
    /* add received vals into ba */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k]+1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j=0; nextcj<cnz; j++){
          if (bj_i[j] == cj[nextcj]){ /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++; nextci[k]++;
        ierr = PetscLogFlops(2.0*cnz);CHKERRQ(ierr);
      }
    }
    ierr = MatSetValues(C,1,&row,bnz,bj_i,ba_i,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  ierr = PetscFree(ba);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r[0]);CHKERRQ(ierr);
  ierr = PetscFree(abuf_r);CHKERRQ(ierr);
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
#undef __FUNCT__
#define __FUNCT__ "MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable"
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(Mat P,Mat A,PetscReal fill,Mat *C)
{
  PetscErrorCode      ierr;
  Mat                 Cmpi,A_loc,POt,PDt;
  Mat_PtAPMPI         *ptap;
  PetscFreeSpaceList  free_space=PETSC_NULL,current_space=PETSC_NULL;
  Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data,*c;
  PetscInt            *pdti,*pdtj,*poti,*potj,*ptJ;
  PetscInt            nnz;
  PetscInt            *lnk,*owners_co,*coi,*coj,i,k,pnz,row;
  PetscInt            am=A->rmap->n,pn=P->cmap->n;
  MPI_Comm            comm=((PetscObject)A)->comm;
  PetscMPIInt         size,rank,tagi,tagj,*len_si,*len_s,*len_ri;
  PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
  PetscInt            len,proc,*dnz,*onz,*owners;
  PetscInt            nzi,*bi,*bj;
  PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
  MPI_Request         *swaits,*rwaits;
  MPI_Status          *sstatus,rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai,*aj,*Jptr,anz,*prmap=p->garray,pon,nspacedouble=0,j;
  PetscReal           afill=1.0,afill_tmp;
  PetscInt            rstart = P->cmap->rstart,rmax,aN=A->cmap->N,Crmax;
  PetscScalar         *vals;
  Mat_SeqAIJ          *a_loc,*pdt,*pot;

  PetscFunctionBegin;
  /* check if matrix local sizes are compatible */
  if (A->rmap->rstart != P->rmap->rstart || A->rmap->rend != P->rmap->rend){
    SETERRQ4(comm,PETSC_ERR_ARG_SIZ,"Matrix local dimensions are incompatible, A (%D, %D) != P (%D,%D)",A->rmap->rstart,A->rmap->rend,P->rmap->rstart,P->rmap->rend);
  }

  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);

  /* create struct Mat_PtAPMPI and attach it to C later */
  ierr = PetscNew(Mat_PtAPMPI,&ptap);CHKERRQ(ierr);

  /* get A_loc by taking all local rows of A */
  ierr = MatMPIAIJGetLocalMat(A,MAT_INITIAL_MATRIX,&A_loc);CHKERRQ(ierr);
  ptap->A_loc = A_loc;
  a_loc = (Mat_SeqAIJ*)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  ierr = MatTransposeSymbolic_SeqAIJ(p->A,&PDt);CHKERRQ(ierr);
  pdt  = (Mat_SeqAIJ*)PDt->data;
  pdti = pdt->i; pdtj = pdt->j;

  ierr = MatTransposeSymbolic_SeqAIJ(p->B,&POt);CHKERRQ(ierr);
  pot  = (Mat_SeqAIJ*)POt->data;
  poti = pot->i; potj = pot->j;

  /* then, compute symbolic Co = (p->B)^T*A */
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                            >= (num of nonzero rows of C_seq) - pn */
  ierr   = PetscMalloc((pon+1)*sizeof(PetscInt),&coi);CHKERRQ(ierr);
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz  = fill*(poti[pon] + ai[am]);
  ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  /* create and initialize a linked list */
  i     = PetscMax(pdt->rmax,pot->rmax);
  Crmax = i*a_loc->rmax*size; /* non-scalable! */
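  /* Crmax is a deliberately rough upper bound on the number of distinct columns any product
     row can touch (max row length of P^T times max row length of A_loc times the number of
     ranks); it is clamped to the global column count aN below and sizes the condensed sorted
     linked list that accumulates each row's column set */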
  if (!Crmax || Crmax > aN) Crmax = aN;
  ierr = PetscLLCondensedCreate_Scalable(Crmax,&lnk);CHKERRQ(ierr);

  for (i=0; i<pon; i++) {
    nnz = 0;
    pnz = poti[i+1] - poti[i];
    ptJ = potj + poti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }

    /* Copy data into free space, and zero out dense rows */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    coi[i+1] = coi[i] + nnz;
  }

  ierr = PetscMalloc((coi[pon]+1)*sizeof(PetscInt),&coj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,coj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)coi[pon]/(poti[pon] + ai[am]);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  ierr = PetscNew(Mat_Merge_SeqsToMPI,&merge);CHKERRQ(ierr);
  ierr = PetscLayoutCreate(comm,&merge->rowmap);CHKERRQ(ierr);
  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;
  ierr   = PetscLayoutSetUp(merge->rowmap);CHKERRQ(ierr);
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  ierr  = PetscMalloc(size*sizeof(PetscMPIInt),&len_si);CHKERRQ(ierr);
  ierr  = PetscMemzero(len_si,size*sizeof(PetscMPIInt));CHKERRQ(ierr);
  ierr  = PetscMalloc(size*sizeof(PetscMPIInt),&merge->len_s);CHKERRQ(ierr);
  len_s = merge->len_s;
  merge->nsend = 0;

  ierr = PetscMalloc((size+2)*sizeof(PetscInt),&owners_co);CHKERRQ(ierr);
  ierr = PetscMemzero(len_s,size*sizeof(PetscMPIInt));CHKERRQ(ierr);

  proc = 0;
  for (i=0; i<pon; i++){
    while (prmap[i] >= owners[proc+1]) proc++;
    len_si[proc]++; /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i+1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc=0; proc<size; proc++){
    owners_co[proc+1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]){
      merge->nsend++;
      len_si[proc] = 2*(len_si[proc] + 1);
      len         += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&merge->nrecv);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths2(comm,merge->nsend,merge->nrecv,len_s,len_si,&merge->id_r,&merge->len_r,&len_ri);CHKERRQ(ierr);

  /* post the Irecv and Isend of coj */
  ierr = PetscCommGetNewTag(comm,&tagj);CHKERRQ(ierr);
  ierr = PetscPostIrecvInt(comm,tagj,merge->nrecv,merge->id_r,merge->len_r,&buf_rj,&rwaits);CHKERRQ(ierr);
  ierr = PetscMalloc((merge->nsend+1)*sizeof(MPI_Request),&swaits);CHKERRQ(ierr);
  for (proc=0, k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    i = owners_co[proc];
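    /* ship the column indices of the Co rows destined for [proc]: owners_co[proc] indexes the
       first such row and len_s[proc] is the total number of nonzeros in the message */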
    ierr = MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);CHKERRQ(ierr);
    k++;
  }

  /* receives and sends of coj are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&sstatus);CHKERRQ(ierr);
  for (i=0; i<merge->nrecv; i++){
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}

  /* send and recv coi */
  /*-------------------*/
  ierr   = PetscCommGetNewTag(comm,&tagi);CHKERRQ(ierr);
  ierr   = PetscPostIrecvInt(comm,tagi,merge->nrecv,merge->id_r,len_ri,&buf_ri,&rwaits);CHKERRQ(ierr);
  ierr   = PetscMalloc((len+1)*sizeof(PetscInt),&buf_s);CHKERRQ(ierr);
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc=0,k=0; proc<size; proc++){
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (global)
         [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc]/2 - 1;
    buf_si_i    = buf_si + nrows+1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i=owners_co[proc]; i<owners_co[proc+1]; i++){
      nzi               = coi[i+1] - coi[i];
      buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;   /* i-structure */
      buf_si[nrows+1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    ierr = MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);CHKERRQ(ierr);
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    PetscMPIInt icompleted;
    ierr = MPI_Waitany(merge->nrecv,rwaits,&icompleted,&rstatus);CHKERRQ(ierr);
  }
  ierr = PetscFree(rwaits);CHKERRQ(ierr);
  if (merge->nsend) {ierr = MPI_Waitall(merge->nsend,swaits,sstatus);CHKERRQ(ierr);}
  ierr = PetscFree(len_si);CHKERRQ(ierr);
  ierr = PetscFree(len_ri);CHKERRQ(ierr);
  ierr = PetscFree(swaits);CHKERRQ(ierr);
  ierr = PetscFree(sstatus);CHKERRQ(ierr);
  ierr = PetscFree(buf_s);CHKERRQ(ierr);

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  ierr  = PetscMalloc((pn+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
  nnz  = fill*(pdti[pn] + poti[pon] + ai[am]);
  ierr = PetscFreeSpaceGet(nnz,&free_space);CHKERRQ(ierr);
  current_space = free_space;

  ierr = PetscMalloc3(merge->nrecv,PetscInt**,&buf_ri_k,merge->nrecv,PetscInt*,&nextrow,merge->nrecv,PetscInt*,&nextci);CHKERRQ(ierr);
  for (k=0; k<merge->nrecv; k++){
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th received i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  ierr = MatPreallocateInitialize(comm,pn,A->cmap->n,dnz,onz);CHKERRQ(ierr);
  rmax = 0;
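  /* for each local row of C = P^T*A: merge the columns of the A_loc rows selected by the
     diagonal block of P^T with the column indices received from other ranks, record the row
     length for preallocation (dnz/onz), and build the local row pointer bi[] */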
  for (i=0; i<pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i+1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j=0; j<pnz; j++){
      row  = ptJ[j]; /* row of AP == col of Pt */
      anz  = ai[row+1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      ierr = PetscLLCondensedAddSorted_Scalable(anz,Jptr,lnk);CHKERRQ(ierr);
    }

    /* add received col data into lnk */
    for (k=0; k<merge->nrecv; k++){ /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k]+1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        ierr = PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);CHKERRQ(ierr);
        nextrow[k]++; nextci[k]++;
      }
    }
    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining<nnz) {
      ierr = PetscFreeSpaceGet(nnz+current_space->total_array_size,&current_space);CHKERRQ(ierr);
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    ierr = PetscLLCondensedClean_Scalable(nnz,current_space->array,lnk);CHKERRQ(ierr);
    ierr = MatPreallocateSet(i+owners[rank],nnz,current_space->array,dnz,onz);CHKERRQ(ierr);
    current_space->array           += nnz;
    current_space->local_used      += nnz;
    current_space->local_remaining -= nnz;
    bi[i+1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  ierr = PetscFree3(buf_ri_k,nextrow,nextci);CHKERRQ(ierr);

  ierr = PetscMalloc((bi[pn]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
  ierr = PetscFreeSpaceContiguous(&free_space,bj);CHKERRQ(ierr);
  afill_tmp = (PetscReal)bi[pn]/(pdti[pn] + poti[pon] + ai[am]);
  if (afill_tmp > afill) afill = afill_tmp;
  ierr = PetscLLCondensedDestroy_Scalable(lnk);CHKERRQ(ierr);
  ierr = MatDestroy(&POt);CHKERRQ(ierr);
  ierr = MatDestroy(&PDt);CHKERRQ(ierr);

  /* create symbolic parallel matrix Cmpi - why can this not be assembled in the numeric part? */
  /*--------------------------------------------------------------------------------------------*/
  ierr = PetscMalloc((rmax+1)*sizeof(PetscScalar),&vals);CHKERRQ(ierr);
  ierr = PetscMemzero(vals,rmax*sizeof(PetscScalar));CHKERRQ(ierr);

  ierr = MatCreate(comm,&Cmpi);CHKERRQ(ierr);
  ierr = MatSetSizes(Cmpi,pn,A->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
  ierr = MatSetType(Cmpi,MATMPIAIJ);CHKERRQ(ierr);
  ierr = MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);CHKERRQ(ierr);
  ierr = MatPreallocateFinalize(dnz,onz);CHKERRQ(ierr);
  ierr = MatSetBlockSize(Cmpi,1);CHKERRQ(ierr);
  for (i=0; i<pn; i++){
    row  = i + rstart;
    nnz  = bi[i+1] - bi[i];
    Jptr = bj + bi[i];
    ierr = MatSetValues(Cmpi,1,&row,nnz,Jptr,vals,INSERT_VALUES);CHKERRQ(ierr);
  }
  ierr = MatAssemblyBegin(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(Cmpi,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = PetscFree(vals);CHKERRQ(ierr);

  merge->bi        = bi;
  merge->bj        = bj;
  merge->coi       = coi;
  merge->coj       = coj;
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;
  merge->destroy   = Cmpi->ops->destroy;
  merge->duplicate = Cmpi->ops->duplicate;

  Cmpi->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable;
  Cmpi->ops->destroy                 = MatDestroy_MPIAIJ_PtAP;

  /* attach the supporting struct to Cmpi for reuse */
  c           = (Mat_MPIAIJ*)Cmpi->data;
  c->ptap     = ptap;
  ptap->api   = PETSC_NULL;
  ptap->apj   = PETSC_NULL;
  ptap->merge = merge;
  ptap->rmax  = rmax;
  ptap->apa   = PETSC_NULL;

  *C = Cmpi;
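  /* everything cached in merge/ptap above (row map, bi/bj/coi/coj, receive buffers, owners_co)
     is what allows MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable() to redo the numeric
     product without repeating this symbolic analysis or the message-length handshaking */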
#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    ierr = PetscInfo3(Cmpi,"Reallocs %D; Fill ratio: given %G needed %G.\n",nspacedouble,fill,afill);CHKERRQ(ierr);
    ierr = PetscInfo1(Cmpi,"Use MatTransposeMatMult(A,B,MatReuse,%G,&C) for best performance.\n",afill);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(Cmpi,"Empty matrix product\n");CHKERRQ(ierr);
  }
#endif
  PetscFunctionReturn(0);
}
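/*
   A minimal sketch (assumed driver code, not part of this file) of how the scalable pair above
   fits together when invoked directly rather than through MatTransposeMatMult():

     Mat C;
     ierr = MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_Scalable(P,A,2.0,&C);CHKERRQ(ierr); // nonzero pattern + preallocation
     ierr = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(P,A,C);CHKERRQ(ierr);       // fill in the values
     ierr = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_Scalable(P,A,C);CHKERRQ(ierr);       // reuse the pattern for new values
     ierr = MatDestroy(&C);CHKERRQ(ierr);
*/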