1 2 /* 3 Defines matrix-matrix product routines for pairs of MPIAIJ matrices 4 C = A * B 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <../src/mat/impls/aij/mpi/mpiaij.h> 9 #include <petscbt.h> 10 #include <../src/mat/impls/dense/mpi/mpidense.h> 11 #include <petsc/private/vecimpl.h> 12 #include <petsc/private/sfimpl.h> 13 14 #if defined(PETSC_HAVE_HYPRE) 15 PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat, Mat, PetscReal, Mat); 16 #endif 17 18 PETSC_INTERN PetscErrorCode MatProductSymbolic_ABt_MPIAIJ_MPIAIJ(Mat C) { 19 Mat_Product *product = C->product; 20 Mat B = product->B; 21 22 PetscFunctionBegin; 23 PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &product->B)); 24 PetscCall(MatDestroy(&B)); 25 PetscCall(MatProductSymbolic_AB_MPIAIJ_MPIAIJ(C)); 26 PetscFunctionReturn(0); 27 } 28 29 PETSC_INTERN PetscErrorCode MatProductSymbolic_AB_MPIAIJ_MPIAIJ(Mat C) { 30 Mat_Product *product = C->product; 31 Mat A = product->A, B = product->B; 32 MatProductAlgorithm alg = product->alg; 33 PetscReal fill = product->fill; 34 PetscBool flg; 35 36 PetscFunctionBegin; 37 /* scalable */ 38 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 39 if (flg) { 40 PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ(A, B, fill, C)); 41 PetscFunctionReturn(0); 42 } 43 44 /* nonscalable */ 45 PetscCall(PetscStrcmp(alg, "nonscalable", &flg)); 46 if (flg) { 47 PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A, B, fill, C)); 48 PetscFunctionReturn(0); 49 } 50 51 /* seqmpi */ 52 PetscCall(PetscStrcmp(alg, "seqmpi", &flg)); 53 if (flg) { 54 PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(A, B, fill, C)); 55 PetscFunctionReturn(0); 56 } 57 58 /* backend general code */ 59 PetscCall(PetscStrcmp(alg, "backend", &flg)); 60 if (flg) { 61 PetscCall(MatProductSymbolic_MPIAIJBACKEND(C)); 62 PetscFunctionReturn(0); 63 } 64 65 #if defined(PETSC_HAVE_HYPRE) 66 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 67 if (flg) { 68 
PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 69 PetscFunctionReturn(0); 70 } 71 #endif 72 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 73 } 74 75 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(void *data) { 76 Mat_APMPI *ptap = (Mat_APMPI *)data; 77 78 PetscFunctionBegin; 79 PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r)); 80 PetscCall(PetscFree(ptap->bufa)); 81 PetscCall(MatDestroy(&ptap->P_loc)); 82 PetscCall(MatDestroy(&ptap->P_oth)); 83 PetscCall(MatDestroy(&ptap->Pt)); 84 PetscCall(PetscFree(ptap->api)); 85 PetscCall(PetscFree(ptap->apj)); 86 PetscCall(PetscFree(ptap->apa)); 87 PetscCall(PetscFree(ptap)); 88 PetscFunctionReturn(0); 89 } 90 91 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A, Mat P, Mat C) { 92 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *c = (Mat_MPIAIJ *)C->data; 93 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 94 Mat_SeqAIJ *cd = (Mat_SeqAIJ *)(c->A)->data, *co = (Mat_SeqAIJ *)(c->B)->data; 95 PetscScalar *cda = cd->a, *coa = co->a; 96 Mat_SeqAIJ *p_loc, *p_oth; 97 PetscScalar *apa, *ca; 98 PetscInt cm = C->rmap->n; 99 Mat_APMPI *ptap; 100 PetscInt *api, *apj, *apJ, i, k; 101 PetscInt cstart = C->cmap->rstart; 102 PetscInt cdnz, conz, k0, k1; 103 const PetscScalar *dummy; 104 MPI_Comm comm; 105 PetscMPIInt size; 106 107 PetscFunctionBegin; 108 MatCheckProduct(C, 3); 109 ptap = (Mat_APMPI *)C->product->data; 110 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 111 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 112 PetscCallMPI(MPI_Comm_size(comm, &size)); 113 PetscCheck(ptap->P_oth || size <= 1, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "AP cannot be reused. 
Do not call MatProductClear()"); 114 115 /* flag CPU mask for C */ 116 #if defined(PETSC_HAVE_DEVICE) 117 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 118 if (c->A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->A->offloadmask = PETSC_OFFLOAD_CPU; 119 if (c->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->B->offloadmask = PETSC_OFFLOAD_CPU; 120 #endif 121 122 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 123 /*-----------------------------------------------------*/ 124 /* update numerical values of P_oth and P_loc */ 125 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 126 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 127 128 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 129 /*----------------------------------------------------------*/ 130 /* get data from symbolic products */ 131 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 132 p_oth = NULL; 133 if (size > 1) { p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; } 134 135 /* get apa for storing dense row A[i,:]*P */ 136 apa = ptap->apa; 137 138 api = ptap->api; 139 apj = ptap->apj; 140 /* trigger copy to CPU */ 141 PetscCall(MatSeqAIJGetArrayRead(a->A, &dummy)); 142 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &dummy)); 143 PetscCall(MatSeqAIJGetArrayRead(a->B, &dummy)); 144 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &dummy)); 145 for (i = 0; i < cm; i++) { 146 /* compute apa = A[i,:]*P */ 147 AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa); 148 149 /* set values in C */ 150 apJ = apj + api[i]; 151 cdnz = cd->i[i + 1] - cd->i[i]; 152 conz = co->i[i + 1] - co->i[i]; 153 154 /* 1st off-diagonal part of C */ 155 ca = coa + co->i[i]; 156 k = 0; 157 for (k0 = 0; k0 < conz; k0++) { 158 if (apJ[k] >= cstart) break; 159 ca[k0] = apa[apJ[k]]; 160 apa[apJ[k++]] = 0.0; 161 } 162 163 /* diagonal part of C */ 164 ca = cda + cd->i[i]; 165 for (k1 = 0; k1 < cdnz; k1++) { 166 
ca[k1] = apa[apJ[k]]; 167 apa[apJ[k++]] = 0.0; 168 } 169 170 /* 2nd off-diagonal part of C */ 171 ca = coa + co->i[i]; 172 for (; k0 < conz; k0++) { 173 ca[k0] = apa[apJ[k]]; 174 apa[apJ[k++]] = 0.0; 175 } 176 } 177 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 178 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 179 PetscFunctionReturn(0); 180 } 181 182 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A, Mat P, PetscReal fill, Mat C) { 183 MPI_Comm comm; 184 PetscMPIInt size; 185 Mat_APMPI *ptap; 186 PetscFreeSpaceList free_space = NULL, current_space = NULL; 187 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 188 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_loc, *p_oth; 189 PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *dnz, *onz; 190 PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, *aoj = ao->j, rstart = A->rmap->rstart; 191 PetscInt *lnk, i, pnz, row, *api, *apj, *Jptr, apnz, nspacedouble = 0, j, nzi; 192 PetscInt am = A->rmap->n, pN = P->cmap->N, pn = P->cmap->n, pm = P->rmap->n; 193 PetscBT lnkbt; 194 PetscReal afill; 195 MatType mtype; 196 197 PetscFunctionBegin; 198 MatCheckProduct(C, 4); 199 PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 200 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 201 PetscCallMPI(MPI_Comm_size(comm, &size)); 202 203 /* create struct Mat_APMPI and attached it to C later */ 204 PetscCall(PetscNew(&ptap)); 205 206 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 207 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 208 209 /* get P_loc by taking all local rows of P */ 210 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 211 212 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 213 pi_loc = p_loc->i; 214 pj_loc = p_loc->j; 215 if (size > 1) { 216 p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 217 pi_oth = 
p_oth->i; 218 pj_oth = p_oth->j; 219 } else { 220 p_oth = NULL; 221 pi_oth = NULL; 222 pj_oth = NULL; 223 } 224 225 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 226 /*-------------------------------------------------------------------*/ 227 PetscCall(PetscMalloc1(am + 2, &api)); 228 ptap->api = api; 229 api[0] = 0; 230 231 /* create and initialize a linked list */ 232 PetscCall(PetscLLCondensedCreate(pN, pN, &lnk, &lnkbt)); 233 234 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 235 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space)); 236 current_space = free_space; 237 238 MatPreallocateBegin(comm, am, pn, dnz, onz); 239 for (i = 0; i < am; i++) { 240 /* diagonal portion of A */ 241 nzi = adi[i + 1] - adi[i]; 242 for (j = 0; j < nzi; j++) { 243 row = *adj++; 244 pnz = pi_loc[row + 1] - pi_loc[row]; 245 Jptr = pj_loc + pi_loc[row]; 246 /* add non-zero cols of P into the sorted linked list lnk */ 247 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 248 } 249 /* off-diagonal portion of A */ 250 nzi = aoi[i + 1] - aoi[i]; 251 for (j = 0; j < nzi; j++) { 252 row = *aoj++; 253 pnz = pi_oth[row + 1] - pi_oth[row]; 254 Jptr = pj_oth + pi_oth[row]; 255 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 256 } 257 /* add possible missing diagonal entry */ 258 if (C->force_diagonals) { 259 j = i + rstart; /* column index */ 260 PetscCall(PetscLLCondensedAddSorted(1, &j, lnk, lnkbt)); 261 } 262 263 apnz = lnk[0]; 264 api[i + 1] = api[i] + apnz; 265 266 /* if free space is not available, double the total space in the list */ 267 if (current_space->local_remaining < apnz) { 268 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 269 nspacedouble++; 270 } 271 272 /* Copy data into free space, then initialize lnk */ 273 PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, 
lnkbt)); 274 PetscCall(MatPreallocateSet(i + rstart, apnz, current_space->array, dnz, onz)); 275 276 current_space->array += apnz; 277 current_space->local_used += apnz; 278 current_space->local_remaining -= apnz; 279 } 280 281 /* Allocate space for apj, initialize apj, and */ 282 /* destroy list of free space and other temporary array(s) */ 283 PetscCall(PetscMalloc1(api[am] + 1, &ptap->apj)); 284 apj = ptap->apj; 285 PetscCall(PetscFreeSpaceContiguous(&free_space, ptap->apj)); 286 PetscCall(PetscLLDestroy(lnk, lnkbt)); 287 288 /* malloc apa to store dense row A[i,:]*P */ 289 PetscCall(PetscCalloc1(pN, &ptap->apa)); 290 291 /* set and assemble symbolic parallel matrix C */ 292 /*---------------------------------------------*/ 293 PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 294 PetscCall(MatSetBlockSizesFromMats(C, A, P)); 295 296 PetscCall(MatGetType(A, &mtype)); 297 PetscCall(MatSetType(C, mtype)); 298 PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz)); 299 MatPreallocateEnd(dnz, onz); 300 301 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api)); 302 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 303 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 304 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 305 306 C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable; 307 C->ops->productnumeric = MatProductNumeric_AB; 308 309 /* attach the supporting struct to C for reuse */ 310 C->product->data = ptap; 311 C->product->destroy = MatDestroy_MPIAIJ_MatMatMult; 312 313 /* set MatInfo */ 314 afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5; 315 if (afill < 1.0) afill = 1.0; 316 C->info.mallocs = nspacedouble; 317 C->info.fill_ratio_given = fill; 318 C->info.fill_ratio_needed = afill; 319 320 #if defined(PETSC_USE_INFO) 321 if (api[am]) { 322 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, 
(double)afill)); 323 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 324 } else { 325 PetscCall(PetscInfo(C, "Empty matrix product\n")); 326 } 327 #endif 328 PetscFunctionReturn(0); 329 } 330 331 /* ------------------------------------------------------- */ 332 static PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat, Mat, PetscReal, Mat); 333 static PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat, Mat, Mat); 334 335 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense_AB(Mat C) { 336 Mat_Product *product = C->product; 337 Mat A = product->A, B = product->B; 338 339 PetscFunctionBegin; 340 if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) 341 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend); 342 343 C->ops->matmultsymbolic = MatMatMultSymbolic_MPIAIJ_MPIDense; 344 C->ops->productsymbolic = MatProductSymbolic_AB; 345 PetscFunctionReturn(0); 346 } 347 /* -------------------------------------------------------------------- */ 348 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense_AtB(Mat C) { 349 Mat_Product *product = C->product; 350 Mat A = product->A, B = product->B; 351 352 PetscFunctionBegin; 353 if (A->rmap->rstart != B->rmap->rstart || A->rmap->rend != B->rmap->rend) 354 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend); 355 356 C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_MPIAIJ_MPIDense; 357 C->ops->productsymbolic = MatProductSymbolic_AtB; 358 PetscFunctionReturn(0); 359 } 360 361 /* --------------------------------------------------------------------- */ 362 PETSC_INTERN 
PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense(Mat C) { 363 Mat_Product *product = C->product; 364 365 PetscFunctionBegin; 366 switch (product->type) { 367 case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_MPIAIJ_MPIDense_AB(C)); break; 368 case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_MPIAIJ_MPIDense_AtB(C)); break; 369 default: break; 370 } 371 PetscFunctionReturn(0); 372 } 373 /* ------------------------------------------------------- */ 374 375 typedef struct { 376 Mat workB, workB1; 377 MPI_Request *rwaits, *swaits; 378 PetscInt nsends, nrecvs; 379 MPI_Datatype *stype, *rtype; 380 PetscInt blda; 381 } MPIAIJ_MPIDense; 382 383 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx) { 384 MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense *)ctx; 385 PetscInt i; 386 387 PetscFunctionBegin; 388 PetscCall(MatDestroy(&contents->workB)); 389 PetscCall(MatDestroy(&contents->workB1)); 390 for (i = 0; i < contents->nsends; i++) { PetscCallMPI(MPI_Type_free(&contents->stype[i])); } 391 for (i = 0; i < contents->nrecvs; i++) { PetscCallMPI(MPI_Type_free(&contents->rtype[i])); } 392 PetscCall(PetscFree4(contents->stype, contents->rtype, contents->rwaits, contents->swaits)); 393 PetscCall(PetscFree(contents)); 394 PetscFunctionReturn(0); 395 } 396 397 static PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A, Mat B, PetscReal fill, Mat C) { 398 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 399 PetscInt nz = aij->B->cmap->n, nsends, nrecvs, i, nrows_to, j, blda, m, M, n, N; 400 MPIAIJ_MPIDense *contents; 401 VecScatter ctx = aij->Mvctx; 402 PetscInt Am = A->rmap->n, Bm = B->rmap->n, BN = B->cmap->N, Bbn, Bbn1, bs, nrows_from, numBb; 403 MPI_Comm comm; 404 MPI_Datatype type1, *stype, *rtype; 405 const PetscInt *sindices, *sstarts, *rstarts; 406 PetscMPIInt *disp; 407 PetscBool cisdense; 408 409 PetscFunctionBegin; 410 MatCheckProduct(C, 4); 411 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty"); 412 
PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 413 PetscCall(PetscObjectBaseTypeCompare((PetscObject)C, MATMPIDENSE, &cisdense)); 414 if (!cisdense) { PetscCall(MatSetType(C, ((PetscObject)B)->type_name)); } 415 PetscCall(MatGetLocalSize(C, &m, &n)); 416 PetscCall(MatGetSize(C, &M, &N)); 417 if (m == PETSC_DECIDE || n == PETSC_DECIDE || M == PETSC_DECIDE || N == PETSC_DECIDE) { PetscCall(MatSetSizes(C, Am, B->cmap->n, A->rmap->N, BN)); } 418 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 419 PetscCall(MatSetUp(C)); 420 PetscCall(MatDenseGetLDA(B, &blda)); 421 PetscCall(PetscNew(&contents)); 422 423 PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, NULL, NULL)); 424 PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, NULL, NULL)); 425 426 /* Create column block of B and C for memory scalability when BN is too large */ 427 /* Estimate Bbn, column size of Bb */ 428 if (nz) { 429 Bbn1 = 2 * Am * BN / nz; 430 if (!Bbn1) Bbn1 = 1; 431 } else Bbn1 = BN; 432 433 bs = PetscAbs(B->cmap->bs); 434 Bbn1 = Bbn1 / bs * bs; /* Bbn1 is a multiple of bs */ 435 if (Bbn1 > BN) Bbn1 = BN; 436 PetscCallMPI(MPI_Allreduce(&Bbn1, &Bbn, 1, MPIU_INT, MPI_MAX, comm)); 437 438 /* Enable runtime option for Bbn */ 439 PetscOptionsBegin(comm, ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 440 PetscCall(PetscOptionsInt("-matmatmult_Bbn", "Number of columns in Bb", "MatMatMult", Bbn, &Bbn, NULL)); 441 PetscOptionsEnd(); 442 Bbn = PetscMin(Bbn, BN); 443 444 if (Bbn > 0 && Bbn < BN) { 445 numBb = BN / Bbn; 446 Bbn1 = BN - numBb * Bbn; 447 } else numBb = 0; 448 449 if (numBb) { 450 PetscCall(PetscInfo(C, "use Bb, BN=%" PetscInt_FMT ", Bbn=%" PetscInt_FMT "; numBb=%" PetscInt_FMT "\n", BN, Bbn, numBb)); 451 if (Bbn1) { /* Create workB1 for the remaining columns */ 452 PetscCall(PetscInfo(C, "use Bb1, BN=%" PetscInt_FMT ", Bbn1=%" PetscInt_FMT "\n", BN, Bbn1)); 453 /* Create work matrix used to store off 
processor rows of B needed for local product */ 454 PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nz, Bbn1, NULL, &contents->workB1)); 455 } else contents->workB1 = NULL; 456 } 457 458 /* Create work matrix used to store off processor rows of B needed for local product */ 459 PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nz, Bbn, NULL, &contents->workB)); 460 461 /* Use MPI derived data type to reduce memory required by the send/recv buffers */ 462 PetscCall(PetscMalloc4(nsends, &stype, nrecvs, &rtype, nrecvs, &contents->rwaits, nsends, &contents->swaits)); 463 contents->stype = stype; 464 contents->nsends = nsends; 465 466 contents->rtype = rtype; 467 contents->nrecvs = nrecvs; 468 contents->blda = blda; 469 470 PetscCall(PetscMalloc1(Bm + 1, &disp)); 471 for (i = 0; i < nsends; i++) { 472 nrows_to = sstarts[i + 1] - sstarts[i]; 473 for (j = 0; j < nrows_to; j++) disp[j] = sindices[sstarts[i] + j]; /* rowB to be sent */ 474 PetscCallMPI(MPI_Type_create_indexed_block(nrows_to, 1, disp, MPIU_SCALAR, &type1)); 475 PetscCallMPI(MPI_Type_create_resized(type1, 0, blda * sizeof(PetscScalar), &stype[i])); 476 PetscCallMPI(MPI_Type_commit(&stype[i])); 477 PetscCallMPI(MPI_Type_free(&type1)); 478 } 479 480 for (i = 0; i < nrecvs; i++) { 481 /* received values from a process form a (nrows_from x Bbn) row block in workB (column-wise) */ 482 nrows_from = rstarts[i + 1] - rstarts[i]; 483 disp[0] = 0; 484 PetscCallMPI(MPI_Type_create_indexed_block(1, nrows_from, disp, MPIU_SCALAR, &type1)); 485 PetscCallMPI(MPI_Type_create_resized(type1, 0, nz * sizeof(PetscScalar), &rtype[i])); 486 PetscCallMPI(MPI_Type_commit(&rtype[i])); 487 PetscCallMPI(MPI_Type_free(&type1)); 488 } 489 490 PetscCall(PetscFree(disp)); 491 PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, NULL, NULL)); 492 PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, NULL, NULL)); 493 PetscCall(MatSetOption(C, 
MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE)); 494 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 495 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 496 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 497 498 C->product->data = contents; 499 C->product->destroy = MatMPIAIJ_MPIDenseDestroy; 500 C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense; 501 PetscFunctionReturn(0); 502 } 503 504 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat, Mat, Mat, const PetscBool); 505 /* 506 Performs an efficient scatter on the rows of B needed by this process; this is 507 a modification of the VecScatterBegin_() routines. 508 509 Input: Bbidx = 0: B = Bb 510 = 1: B = Bb1, see MatMatMultSymbolic_MPIAIJ_MPIDense() 511 */ 512 PetscErrorCode MatMPIDenseScatter(Mat A, Mat B, PetscInt Bbidx, Mat C, Mat *outworkB) { 513 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 514 const PetscScalar *b; 515 PetscScalar *rvalues; 516 VecScatter ctx = aij->Mvctx; 517 const PetscInt *sindices, *sstarts, *rstarts; 518 const PetscMPIInt *sprocs, *rprocs; 519 PetscInt i, nsends, nrecvs; 520 MPI_Request *swaits, *rwaits; 521 MPI_Comm comm; 522 PetscMPIInt tag = ((PetscObject)ctx)->tag, ncols = B->cmap->N, nrows = aij->B->cmap->n, nsends_mpi, nrecvs_mpi; 523 MPIAIJ_MPIDense *contents; 524 Mat workB; 525 MPI_Datatype *stype, *rtype; 526 PetscInt blda; 527 528 PetscFunctionBegin; 529 MatCheckProduct(C, 4); 530 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 531 contents = (MPIAIJ_MPIDense *)C->product->data; 532 PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, &sprocs, NULL /*bs*/)); 533 PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, &rprocs, NULL /*bs*/)); 534 PetscCall(PetscMPIIntCast(nsends, &nsends_mpi)); 535 PetscCall(PetscMPIIntCast(nrecvs, &nrecvs_mpi)); 536 if (Bbidx == 0) workB = *outworkB = contents->workB; 537 
else workB = *outworkB = contents->workB1; 538 PetscCheck(nrows == workB->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of rows of workB %" PetscInt_FMT " not equal to columns of aij->B %d", workB->cmap->n, nrows); 539 swaits = contents->swaits; 540 rwaits = contents->rwaits; 541 542 PetscCall(MatDenseGetArrayRead(B, &b)); 543 PetscCall(MatDenseGetLDA(B, &blda)); 544 PetscCheck(blda == contents->blda, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot reuse an input matrix with lda %" PetscInt_FMT " != %" PetscInt_FMT, blda, contents->blda); 545 PetscCall(MatDenseGetArray(workB, &rvalues)); 546 547 /* Post recv, use MPI derived data type to save memory */ 548 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 549 rtype = contents->rtype; 550 for (i = 0; i < nrecvs; i++) { PetscCallMPI(MPI_Irecv(rvalues + (rstarts[i] - rstarts[0]), ncols, rtype[i], rprocs[i], tag, comm, rwaits + i)); } 551 552 stype = contents->stype; 553 for (i = 0; i < nsends; i++) { PetscCallMPI(MPI_Isend(b, ncols, stype[i], sprocs[i], tag, comm, swaits + i)); } 554 555 if (nrecvs) PetscCallMPI(MPI_Waitall(nrecvs_mpi, rwaits, MPI_STATUSES_IGNORE)); 556 if (nsends) PetscCallMPI(MPI_Waitall(nsends_mpi, swaits, MPI_STATUSES_IGNORE)); 557 558 PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, &sprocs, NULL)); 559 PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, &rprocs, NULL)); 560 PetscCall(MatDenseRestoreArrayRead(B, &b)); 561 PetscCall(MatDenseRestoreArray(workB, &rvalues)); 562 PetscFunctionReturn(0); 563 } 564 565 static PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A, Mat B, Mat C) { 566 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 567 Mat_MPIDense *bdense = (Mat_MPIDense *)B->data; 568 Mat_MPIDense *cdense = (Mat_MPIDense *)C->data; 569 Mat workB; 570 MPIAIJ_MPIDense *contents; 571 572 PetscFunctionBegin; 573 MatCheckProduct(C, 3); 574 PetscCheck(C->product->data, 
PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 575 contents = (MPIAIJ_MPIDense *)C->product->data; 576 /* diagonal block of A times all local rows of B */ 577 /* TODO: this calls a symbolic multiplication every time, which could be avoided */ 578 PetscCall(MatMatMult(aij->A, bdense->A, MAT_REUSE_MATRIX, PETSC_DEFAULT, &cdense->A)); 579 if (contents->workB->cmap->n == B->cmap->N) { 580 /* get off processor parts of B needed to complete C=A*B */ 581 PetscCall(MatMPIDenseScatter(A, B, 0, C, &workB)); 582 583 /* off-diagonal block of A times nonlocal rows of B */ 584 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B, workB, cdense->A, PETSC_TRUE)); 585 } else { 586 Mat Bb, Cb; 587 PetscInt BN = B->cmap->N, n = contents->workB->cmap->n, i; 588 PetscBool ccpu; 589 590 PetscCheck(n > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Column block size %" PetscInt_FMT " must be positive", n); 591 /* Prevent from unneeded copies back and forth from the GPU 592 when getting and restoring the submatrix 593 We need a proper GPU code for AIJ * dense in parallel */ 594 PetscCall(MatBoundToCPU(C, &ccpu)); 595 PetscCall(MatBindToCPU(C, PETSC_TRUE)); 596 for (i = 0; i < BN; i += n) { 597 PetscCall(MatDenseGetSubMatrix(B, PETSC_DECIDE, PETSC_DECIDE, i, PetscMin(i + n, BN), &Bb)); 598 PetscCall(MatDenseGetSubMatrix(C, PETSC_DECIDE, PETSC_DECIDE, i, PetscMin(i + n, BN), &Cb)); 599 600 /* get off processor parts of B needed to complete C=A*B */ 601 PetscCall(MatMPIDenseScatter(A, Bb, (i + n) > BN, C, &workB)); 602 603 /* off-diagonal block of A times nonlocal rows of B */ 604 cdense = (Mat_MPIDense *)Cb->data; 605 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B, workB, cdense->A, PETSC_TRUE)); 606 PetscCall(MatDenseRestoreSubMatrix(B, &Bb)); 607 PetscCall(MatDenseRestoreSubMatrix(C, &Cb)); 608 } 609 PetscCall(MatBindToCPU(C, ccpu)); 610 } 611 PetscFunctionReturn(0); 612 } 613 614 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C) { 615 
Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *c = (Mat_MPIAIJ *)C->data; 616 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 617 Mat_SeqAIJ *cd = (Mat_SeqAIJ *)(c->A)->data, *co = (Mat_SeqAIJ *)(c->B)->data; 618 PetscInt *adi = ad->i, *adj, *aoi = ao->i, *aoj; 619 PetscScalar *ada, *aoa, *cda = cd->a, *coa = co->a; 620 Mat_SeqAIJ *p_loc, *p_oth; 621 PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *pj; 622 PetscScalar *pa_loc, *pa_oth, *pa, valtmp, *ca; 623 PetscInt cm = C->rmap->n, anz, pnz; 624 Mat_APMPI *ptap; 625 PetscScalar *apa_sparse; 626 const PetscScalar *dummy; 627 PetscInt *api, *apj, *apJ, i, j, k, row; 628 PetscInt cstart = C->cmap->rstart; 629 PetscInt cdnz, conz, k0, k1, nextp; 630 MPI_Comm comm; 631 PetscMPIInt size; 632 633 PetscFunctionBegin; 634 MatCheckProduct(C, 3); 635 ptap = (Mat_APMPI *)C->product->data; 636 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 637 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 638 PetscCallMPI(MPI_Comm_size(comm, &size)); 639 PetscCheck(ptap->P_oth || size <= 1, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "AP cannot be reused. 
Do not call MatProductClear()"); 640 641 /* flag CPU mask for C */ 642 #if defined(PETSC_HAVE_DEVICE) 643 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 644 if (c->A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->A->offloadmask = PETSC_OFFLOAD_CPU; 645 if (c->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->B->offloadmask = PETSC_OFFLOAD_CPU; 646 #endif 647 apa_sparse = ptap->apa; 648 649 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 650 /*-----------------------------------------------------*/ 651 /* update numerical values of P_oth and P_loc */ 652 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 653 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 654 655 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 656 /*----------------------------------------------------------*/ 657 /* get data from symbolic products */ 658 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 659 pi_loc = p_loc->i; 660 pj_loc = p_loc->j; 661 pa_loc = p_loc->a; 662 if (size > 1) { 663 p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 664 pi_oth = p_oth->i; 665 pj_oth = p_oth->j; 666 pa_oth = p_oth->a; 667 } else { 668 p_oth = NULL; 669 pi_oth = NULL; 670 pj_oth = NULL; 671 pa_oth = NULL; 672 } 673 674 /* trigger copy to CPU */ 675 PetscCall(MatSeqAIJGetArrayRead(a->A, &dummy)); 676 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &dummy)); 677 PetscCall(MatSeqAIJGetArrayRead(a->B, &dummy)); 678 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &dummy)); 679 api = ptap->api; 680 apj = ptap->apj; 681 for (i = 0; i < cm; i++) { 682 apJ = apj + api[i]; 683 684 /* diagonal portion of A */ 685 anz = adi[i + 1] - adi[i]; 686 adj = ad->j + adi[i]; 687 ada = ad->a + adi[i]; 688 for (j = 0; j < anz; j++) { 689 row = adj[j]; 690 pnz = pi_loc[row + 1] - pi_loc[row]; 691 pj = pj_loc + pi_loc[row]; 692 pa = pa_loc + pi_loc[row]; 693 /* perform sparse axpy */ 694 valtmp = ada[j]; 
695 nextp = 0; 696 for (k = 0; nextp < pnz; k++) { 697 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 698 apa_sparse[k] += valtmp * pa[nextp++]; 699 } 700 } 701 PetscCall(PetscLogFlops(2.0 * pnz)); 702 } 703 704 /* off-diagonal portion of A */ 705 anz = aoi[i + 1] - aoi[i]; 706 aoj = ao->j + aoi[i]; 707 aoa = ao->a + aoi[i]; 708 for (j = 0; j < anz; j++) { 709 row = aoj[j]; 710 pnz = pi_oth[row + 1] - pi_oth[row]; 711 pj = pj_oth + pi_oth[row]; 712 pa = pa_oth + pi_oth[row]; 713 /* perform sparse axpy */ 714 valtmp = aoa[j]; 715 nextp = 0; 716 for (k = 0; nextp < pnz; k++) { 717 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 718 apa_sparse[k] += valtmp * pa[nextp++]; 719 } 720 } 721 PetscCall(PetscLogFlops(2.0 * pnz)); 722 } 723 724 /* set values in C */ 725 cdnz = cd->i[i + 1] - cd->i[i]; 726 conz = co->i[i + 1] - co->i[i]; 727 728 /* 1st off-diagonal part of C */ 729 ca = coa + co->i[i]; 730 k = 0; 731 for (k0 = 0; k0 < conz; k0++) { 732 if (apJ[k] >= cstart) break; 733 ca[k0] = apa_sparse[k]; 734 apa_sparse[k] = 0.0; 735 k++; 736 } 737 738 /* diagonal part of C */ 739 ca = cda + cd->i[i]; 740 for (k1 = 0; k1 < cdnz; k1++) { 741 ca[k1] = apa_sparse[k]; 742 apa_sparse[k] = 0.0; 743 k++; 744 } 745 746 /* 2nd off-diagonal part of C */ 747 ca = coa + co->i[i]; 748 for (; k0 < conz; k0++) { 749 ca[k0] = apa_sparse[k]; 750 apa_sparse[k] = 0.0; 751 k++; 752 } 753 } 754 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 755 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 756 PetscFunctionReturn(0); 757 } 758 759 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */ 760 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat C) { 761 MPI_Comm comm; 762 PetscMPIInt size; 763 Mat_APMPI *ptap; 764 PetscFreeSpaceList free_space = NULL, current_space = NULL; 765 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 766 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = 
(Mat_SeqAIJ *)(a->B)->data, *p_loc, *p_oth; 767 PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *dnz, *onz; 768 PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, *aoj = ao->j, rstart = A->rmap->rstart; 769 PetscInt i, pnz, row, *api, *apj, *Jptr, apnz, nspacedouble = 0, j, nzi, *lnk, apnz_max = 1; 770 PetscInt am = A->rmap->n, pn = P->cmap->n, pm = P->rmap->n, lsize = pn + 20; 771 PetscReal afill; 772 MatType mtype; 773 774 PetscFunctionBegin; 775 MatCheckProduct(C, 4); 776 PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 777 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 778 PetscCallMPI(MPI_Comm_size(comm, &size)); 779 780 /* create struct Mat_APMPI and attached it to C later */ 781 PetscCall(PetscNew(&ptap)); 782 783 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 784 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 785 786 /* get P_loc by taking all local rows of P */ 787 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 788 789 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 790 pi_loc = p_loc->i; 791 pj_loc = p_loc->j; 792 if (size > 1) { 793 p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 794 pi_oth = p_oth->i; 795 pj_oth = p_oth->j; 796 } else { 797 p_oth = NULL; 798 pi_oth = NULL; 799 pj_oth = NULL; 800 } 801 802 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 803 /*-------------------------------------------------------------------*/ 804 PetscCall(PetscMalloc1(am + 2, &api)); 805 ptap->api = api; 806 api[0] = 0; 807 808 PetscCall(PetscLLCondensedCreate_Scalable(lsize, &lnk)); 809 810 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 811 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space)); 812 current_space = free_space; 813 MatPreallocateBegin(comm, am, pn, 
dnz, onz); 814 for (i = 0; i < am; i++) { 815 /* diagonal portion of A */ 816 nzi = adi[i + 1] - adi[i]; 817 for (j = 0; j < nzi; j++) { 818 row = *adj++; 819 pnz = pi_loc[row + 1] - pi_loc[row]; 820 Jptr = pj_loc + pi_loc[row]; 821 /* Expand list if it is not long enough */ 822 if (pnz + apnz_max > lsize) { 823 lsize = pnz + apnz_max; 824 PetscCall(PetscLLCondensedExpand_Scalable(lsize, &lnk)); 825 } 826 /* add non-zero cols of P into the sorted linked list lnk */ 827 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 828 apnz = *lnk; /* The first element in the list is the number of items in the list */ 829 api[i + 1] = api[i] + apnz; 830 if (apnz > apnz_max) apnz_max = apnz + 1; /* '1' for diagonal entry */ 831 } 832 /* off-diagonal portion of A */ 833 nzi = aoi[i + 1] - aoi[i]; 834 for (j = 0; j < nzi; j++) { 835 row = *aoj++; 836 pnz = pi_oth[row + 1] - pi_oth[row]; 837 Jptr = pj_oth + pi_oth[row]; 838 /* Expand list if it is not long enough */ 839 if (pnz + apnz_max > lsize) { 840 lsize = pnz + apnz_max; 841 PetscCall(PetscLLCondensedExpand_Scalable(lsize, &lnk)); 842 } 843 /* add non-zero cols of P into the sorted linked list lnk */ 844 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 845 apnz = *lnk; /* The first element in the list is the number of items in the list */ 846 api[i + 1] = api[i] + apnz; 847 if (apnz > apnz_max) apnz_max = apnz + 1; /* '1' for diagonal entry */ 848 } 849 850 /* add missing diagonal entry */ 851 if (C->force_diagonals) { 852 j = i + rstart; /* column index */ 853 PetscCall(PetscLLCondensedAddSorted_Scalable(1, &j, lnk)); 854 } 855 856 apnz = *lnk; 857 api[i + 1] = api[i] + apnz; 858 if (apnz > apnz_max) apnz_max = apnz; 859 860 /* if free space is not available, double the total space in the list */ 861 if (current_space->local_remaining < apnz) { 862 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 863 nspacedouble++; 864 } 865 866 /* Copy data into 
free space, then initialize lnk */ 867 PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk)); 868 PetscCall(MatPreallocateSet(i + rstart, apnz, current_space->array, dnz, onz)); 869 870 current_space->array += apnz; 871 current_space->local_used += apnz; 872 current_space->local_remaining -= apnz; 873 } 874 875 /* Allocate space for apj, initialize apj, and */ 876 /* destroy list of free space and other temporary array(s) */ 877 PetscCall(PetscMalloc1(api[am] + 1, &ptap->apj)); 878 apj = ptap->apj; 879 PetscCall(PetscFreeSpaceContiguous(&free_space, ptap->apj)); 880 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 881 882 /* create and assemble symbolic parallel matrix C */ 883 /*----------------------------------------------------*/ 884 PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 885 PetscCall(MatSetBlockSizesFromMats(C, A, P)); 886 PetscCall(MatGetType(A, &mtype)); 887 PetscCall(MatSetType(C, mtype)); 888 PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz)); 889 MatPreallocateEnd(dnz, onz); 890 891 /* malloc apa for assembly C */ 892 PetscCall(PetscCalloc1(apnz_max, &ptap->apa)); 893 894 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api)); 895 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 896 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 897 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 898 899 C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ; 900 C->ops->productnumeric = MatProductNumeric_AB; 901 902 /* attach the supporting struct to C for reuse */ 903 C->product->data = ptap; 904 C->product->destroy = MatDestroy_MPIAIJ_MatMatMult; 905 906 /* set MatInfo */ 907 afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5; 908 if (afill < 1.0) afill = 1.0; 909 C->info.mallocs = nspacedouble; 910 C->info.fill_ratio_given = fill; 911 C->info.fill_ratio_needed = afill; 912 913 #if defined(PETSC_USE_INFO) 914 if (api[am]) { 915 
PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
    PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
  } else {
    PetscCall(PetscInfo(C, "Empty matrix product\n"));
  }
#endif
  PetscFunctionReturn(0);
}

/* This function is needed for the seqMPI matrix-matrix multiplication. */
/* Three input arrays are merged to one output array. The size of the */
/* output array is also output. Duplicate entries only show up once. */
/* All three inputs must be sorted ascending; the output is then sorted and */
/* duplicate-free. out must provide room for size1+size2+size3 entries. */
/* NOTE(review): the cursors i,j,k,l are plain 'int' while the sizes are */
/* PetscInt; with 64-bit indices an extremely long row could overflow int -- */
/* confirm row lengths stay below INT_MAX. */
static void Merge3SortedArrays(PetscInt size1, PetscInt *in1, PetscInt size2, PetscInt *in2, PetscInt size3, PetscInt *in3, PetscInt *size4, PetscInt *out) {
  int i = 0, j = 0, k = 0, l = 0;

  /* Traverse all three arrays: the seven branches cover every ordering of the
     three current heads (three strict minima, three pairwise ties, one triple tie) */
  while (i < size1 && j < size2 && k < size3) {
    if (in1[i] < in2[j] && in1[i] < in3[k]) {
      out[l++] = in1[i++];
    } else if (in2[j] < in1[i] && in2[j] < in3[k]) {
      out[l++] = in2[j++];
    } else if (in3[k] < in1[i] && in3[k] < in2[j]) {
      out[l++] = in3[k++];
    } else if (in1[i] == in2[j] && in1[i] < in3[k]) {
      out[l++] = in1[i];
      i++, j++;
    } else if (in1[i] == in3[k] && in1[i] < in2[j]) {
      out[l++] = in1[i];
      i++, k++;
    } else if (in3[k] == in2[j] && in2[j] < in1[i]) {
      out[l++] = in2[j];
      k++, j++;
    } else if (in1[i] == in2[j] && in1[i] == in3[k]) {
      /* triple tie: emit once, advance all three cursors */
      out[l++] = in1[i];
      i++, j++, k++;
    }
  }

  /* Traverse two remaining arrays */
  while (i < size1 && j < size2) {
    if (in1[i] < in2[j]) {
      out[l++] = in1[i++];
    } else if (in1[i] > in2[j]) {
      out[l++] = in2[j++];
    } else {
      out[l++] = in1[i];
      i++, j++;
    }
  }

  while (i < size1 && k < size3) {
    if (in1[i] < in3[k]) {
      out[l++] = in1[i++];
    } else if (in1[i] > in3[k]) {
      out[l++] = in3[k++];
    } else {
      out[l++] = in1[i];
      i++, k++;
    }
  }

  while (k < size3 && j < size2) {
    if (in3[k] < in2[j]) {
      out[l++] = in3[k++];
    } else if (in3[k] > in2[j]) {
      out[l++] = in2[j++];
    } else {
      out[l++] = in3[k];
      k++, j++;
    }
  }

  /* Traverse one remaining array */
  while (i < size1) out[l++] = in1[i++];
  while (j < size2) out[l++] = in2[j++];
  while (k < size3) out[l++] = in3[k++];

  /* total number of (unique) merged entries */
  *size4 = l;
}

/* This matrix-matrix multiplication algorithm divides the multiplication into three multiplications and */
/* adds up the products. Two of these three multiplications are performed with existing (sequential) */
/* matrix-matrix multiplications. */
/* Symbolic phase of C = A*P: splits A*P into A_diag*P_loc_diag (sequential
   product 'adpd'), A_diag*P_loc_off (computed row-by-row with a condensed
   linked list) and A_off*P_oth (sequential product 'aopoth'), then merges the
   three per-row column sets with Merge3SortedArrays. Attaches a Mat_APMPI
   support struct to C->product for reuse by the numeric phase. */
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(Mat A, Mat P, PetscReal fill, Mat C) {
  MPI_Comm comm;
  PetscMPIInt size;
  Mat_APMPI *ptap;
  PetscFreeSpaceList free_space_diag = NULL, current_space = NULL;
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_loc;
  Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *adpd_seq, *p_off, *aopoth_seq;
  PetscInt adponz, adpdnz;
  PetscInt *pi_loc, *dnz, *onz;
  PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, rstart = A->rmap->rstart;
  PetscInt *lnk, i, i1 = 0, pnz, row, *adpoi, *adpoj, *api, *adpoJ, *aopJ, *apJ, *Jptr, aopnz, nspacedouble = 0, j, nzi, *apj, apnz, *adpdi, *adpdj, *adpdJ, *poff_i, *poff_j, *j_temp, *aopothi, *aopothj;
  PetscInt am = A->rmap->n, pN = P->cmap->N, pn = P->cmap->n, pm = P->rmap->n, p_colstart, p_colend;
  PetscBT lnkbt;
  PetscReal afill;
  PetscMPIInt rank;
  Mat adpd, aopoth;
  MatType mtype;
  const char *prefix;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(MatGetOwnershipRangeColumn(P, &p_colstart, &p_colend));

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ptap));

  /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
  PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth));

  /* get P_loc by taking all local rows of P */
  PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc));

  p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data;
  pi_loc = p_loc->i;

  /* Allocate memory for the i arrays of the matrices A*P, A_diag*P_off and A_offd * P */
  PetscCall(PetscMalloc1(am + 2, &api));
  PetscCall(PetscMalloc1(am + 2, &adpoi));

  adpoi[0] = 0;
  ptap->api = api;
  api[0] = 0;

  /* create and initialize a linked list, will be used for both A_diag * P_loc_off and A_offd * P_oth */
  PetscCall(PetscLLCondensedCreate(pN, pN, &lnk, &lnkbt));
  MatPreallocateBegin(comm, am, pn, dnz, onz);

  /* Symbolic calc of A_loc_diag * P_loc_diag */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatProductCreate(a->A, p->A, NULL, &adpd));
  /* NOTE(review): prefix was already fetched above; this second call is redundant */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(adpd, prefix));
  PetscCall(MatAppendOptionsPrefix(adpd, "inner_diag_"));

  PetscCall(MatProductSetType(adpd, MATPRODUCT_AB));
  PetscCall(MatProductSetAlgorithm(adpd, "sorted"));
  PetscCall(MatProductSetFill(adpd, fill));
  PetscCall(MatProductSetFromOptions(adpd));

  /* propagate the force-diagonal request so the inner product keeps diagonals too */
  adpd->force_diagonals = C->force_diagonals;
  PetscCall(MatProductSymbolic(adpd));

  adpd_seq = (Mat_SeqAIJ *)((adpd)->data);
  adpdi = adpd_seq->i;
  adpdj = adpd_seq->j;
  p_off = (Mat_SeqAIJ *)((p->B)->data);
  poff_i = p_off->i;
  poff_j = p_off->j;

  /* j_temp stores indices of a result row before they are added to the linked list */
  PetscCall(PetscMalloc1(pN + 2, &j_temp));

  /* Symbolic calc of the A_diag * p_loc_off */
  /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */
  PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space_diag));
  current_space = free_space_diag;

  for (i = 0; i < am; i++) {
    /* A_diag * P_loc_off: for each nonzero A_diag(i,row), scatter the columns of
       P_off(row,:) (mapped to global indices via garray) into the linked list */
    nzi = adi[i + 1] - adi[i];
    for (j = 0; j < nzi; j++) {
      row = *adj++;
      pnz = poff_i[row + 1] - poff_i[row];
      Jptr = poff_j + poff_i[row];
      for (i1 = 0; i1 < pnz; i1++) { j_temp[i1] = p->garray[Jptr[i1]]; }
      /* add non-zero cols of P into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted(pnz, j_temp, lnk, lnkbt));
    }

    adponz = lnk[0]; /* lnk[0] holds the current number of entries in the list */
    adpoi[i + 1] = adpoi[i] + adponz;

    /* if free space is not available, double the total space in the list */
    if (current_space->local_remaining < adponz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(adponz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(pN, adponz, current_space->array, lnk, lnkbt));

    current_space->array += adponz;
    current_space->local_used += adponz;
    current_space->local_remaining -= adponz;
  }

  /* Symbolic calc of A_off * P_oth */
  /* NOTE(review): this permanently appends "inner_offdiag_" to a->B's options
     prefix each time this symbolic routine runs -- confirm that is intended */
  PetscCall(MatSetOptionsPrefix(a->B, prefix));
  PetscCall(MatAppendOptionsPrefix(a->B, "inner_offdiag_"));
  PetscCall(MatCreate(PETSC_COMM_SELF, &aopoth));
  PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(a->B, ptap->P_oth, fill, aopoth));
  aopoth_seq = (Mat_SeqAIJ *)((aopoth)->data);
  aopothi = aopoth_seq->i;
  aopothj = aopoth_seq->j;

  /* Allocate space for apj, adpj, aopj, ... */
  /* destroy lists of free space and other temporary array(s) */

  PetscCall(PetscMalloc1(aopothi[am] + adpoi[am] + adpdi[am] + 2, &ptap->apj));
  PetscCall(PetscMalloc1(adpoi[am] + 2, &adpoj));

  /* Copy from linked list to j-array */
  PetscCall(PetscFreeSpaceContiguous(&free_space_diag, adpoj));
  PetscCall(PetscLLDestroy(lnk, lnkbt));

  adpoJ = adpoj;
  adpdJ = adpdj;
  aopJ = aopothj;
  apj = ptap->apj;
  apJ = apj; /* still empty */

  /* Merge j-arrays of A_off * P, A_diag * P_loc_off, and */
  /* A_diag * P_loc_diag to get A*P */
  for (i = 0; i < am; i++) {
    aopnz = aopothi[i + 1] - aopothi[i];
    adponz = adpoi[i + 1] - adpoi[i];
    adpdnz = adpdi[i + 1] - adpdi[i];

    /* Correct indices from A_diag*P_diag: shift local column numbers of the
       diagonal product to global numbering before the merge */
    for (i1 = 0; i1 < adpdnz; i1++) { adpdJ[i1] += p_colstart; }
    /* Merge j-arrays of A_diag * P_loc_off and A_diag * P_loc_diag and A_off * P_oth */
    Merge3SortedArrays(adponz, adpoJ, adpdnz, adpdJ, aopnz, aopJ, &apnz, apJ);
    PetscCall(MatPreallocateSet(i + rstart, apnz, apJ, dnz, onz));

    aopJ += aopnz;
    adpoJ += adponz;
    adpdJ += adpdnz;
    apJ += apnz;
    api[i + 1] = api[i] + apnz;
  }

  /* malloc apa to store dense row A[i,:]*P */
  PetscCall(PetscCalloc1(pN + 2, &ptap->apa));

  /* create and assemble symbolic parallel matrix C */
  PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizesFromMats(C, A, P));
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(C, mtype));
  PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api));
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));

  C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  C->ops->productnumeric = MatProductNumeric_AB;

  /* attach the supporting struct to C for reuse */
  C->product->data = ptap;
  C->product->destroy = MatDestroy_MPIAIJ_MatMatMult;

  /* set MatInfo */
  afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  C->info.mallocs = nspacedouble;
  C->info.fill_ratio_given = fill;
  C->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
    PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
  } else {
    PetscCall(PetscInfo(C, "Empty matrix product\n"));
  }
#endif

  PetscCall(MatDestroy(&aopoth));
  PetscCall(MatDestroy(&adpd));
  PetscCall(PetscFree(j_temp));
  PetscCall(PetscFree(adpoj));
  PetscCall(PetscFree(adpoi));
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
/* This routine only works when scall=MAT_REUSE_MATRIX! */
/* Numeric C = P^T * A implemented by re-transposing P into the cached Pt
   and delegating to the plain MPIAIJ mat-mat numeric kernel. */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P, Mat A, Mat C) {
  Mat_APMPI *ptap;
  Mat Pt;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->Pt, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");

  Pt = ptap->Pt;
  PetscCall(MatTransposeSetPrecursor(P, Pt));
  PetscCall(MatTranspose(P, MAT_REUSE_MATRIX, &Pt));
  PetscCall(MatMatMultNumeric_MPIAIJ_MPIAIJ(Pt, A, C));
  PetscFunctionReturn(0);
}

/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
/* Symbolic C = P^T * A: forms Rd = Pd^T and Ro = Po^T, computes the local
   products C_loc = Rd*A_loc and C_oth = Ro*A_loc, then exchanges the rows of
   C_oth owned by other ranks to assemble the sparsity pattern of C. */
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P, Mat A, PetscReal fill, Mat C) {
  Mat_APMPI *ptap;
  Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm comm;
  PetscMPIInt size, rank;
  PetscFreeSpaceList free_space = NULL, current_space = NULL;
  PetscInt pn = P->cmap->n, aN = A->cmap->N, an = A->cmap->n;
  PetscInt *lnk, i, k, nsend, rstart;
  PetscBT lnkbt;
  PetscMPIInt tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt **buf_rj, **buf_ri, **buf_ri_k, row, ncols, *cols;
  PetscInt len, proc, *dnz, *onz, *owners, nzi;
  PetscInt nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request *swaits, *rwaits;
  MPI_Status *sstatus, rstatus;
  PetscLayout rowmap;
  PetscInt *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */
  PetscInt *Jptr, *prmap = p->garray, con, j, Crmax;
  Mat_SeqAIJ *a_loc, *c_loc, *c_oth;
  PetscTable ta;
  MatType mtype;
  const char *prefix;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* create symbolic parallel matrix C */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(C, mtype));

  C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;

  /* create struct Mat_APMPI and attach it to C later */
PetscCall(PetscNew(&ptap));

  /* (0) compute Rd = Pd^T, Ro = Po^T  */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic A_loc */
  /* ---------------------------*/
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &ptap->A_loc));

  /* (2-1) compute symbolic C_oth = Ro*A_loc  */
  /* ------------------------------------ */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
  PetscCall(MatCreate(PETSC_COMM_SELF, &ptap->C_oth));
  PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro, ptap->A_loc, fill, ptap->C_oth));

  /* (3) send coj of C_oth to other processors  */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  rowmap->n = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi = c_oth->i;
  coj = c_oth->j;
  con = ptap->C_oth->rmap->n;
  proc = 0;
  /* rows of C_oth correspond to off-diagonal columns of P (prmap = garray),
     which are sorted, so a single forward 'proc' sweep finds each owner */
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++; /* num of rows in Co(=Pt*A) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj  */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*A_loc */
  /* ---------------------------------------- */
  /* overlapped with the coj communication posted above */
  PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
  PetscCall(MatCreate(PETSC_COMM_SELF, &ptap->C_loc));
  PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd, ptap->A_loc, fill, ptap->C_loc));
  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) { PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  a_loc = (Mat_SeqAIJ *)(ptap->A_loc)->data;

  /* create and initialize a linked list */
  PetscCall(PetscTableCreate(an, aN, &ta)); /* for compute Crmax */
  MatRowMergeMax_SeqAIJ(a_loc, ptap->A_loc->rmap->N, ta);

  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    /* table keys are shifted by +1 because PetscTable cannot store key 0 */
    for (j = 0; j < len_r[k]; j++) { PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES)); }
  }
  PetscCall(PetscTableGetCount(ta, &Crmax)); /* Crmax = upper bound on nnz per row of C */
  PetscCall(PetscTableDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i = buf_si + nrows + 1;
    buf_si[0] = nrows;
    buf_si_i[0] = 0;
    nrows = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) { PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus)); }
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of C      */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of C */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows = *buf_ri_k[k];
    nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
    nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  MatPreallocateBegin(comm, pn, an, dnz, onz);
  PetscCall(PetscLLCondensedCreate(Crmax, aN, &lnk, &lnkbt));
  for (i = 0; i < pn; i++) { /* for each local row of C */
    /* add C_loc into C */
    nzi = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
        nextrow[k]++;
        nextci[k]++;
      }
    }

    /* add missing diagonal entry */
    if (C->force_diagonals) {
      k = i + owners[rank]; /* column index */
      PetscCall(PetscLLCondensedAddSorted(1, &k, lnk, lnkbt));
    }

    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(aN, nzi, current_space->array, lnk, lnkbt));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(C, pn, an, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, P->cmap->bs));
  if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, A->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* add C_loc and C_oth to C: insert the pattern only (NULL values), the
     numeric phase fills the values with ADD_VALUES */
  PetscCall(MatGetOwnershipRange(C, &rstart, NULL));
  for (i = 0; i < pn; i++) {
    ncols = c_loc->i[i + 1] - c_loc->i[i];
    cols = c_loc->j + c_loc->i[i];
    row = rstart + i;
    PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, ncols, (const PetscInt *)cols, NULL, INSERT_VALUES));

    if (C->force_diagonals) { PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, 1, (const PetscInt *)&row, NULL, INSERT_VALUES)); }
  }
  for (i = 0; i < con; i++) {
    ncols = c_oth->i[i + 1] - c_oth->i[i];
    cols = c_oth->j + c_oth->i[i];
    row = prmap[i];
    PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, ncols, (const PetscInt *)cols, NULL, INSERT_VALUES));
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* attach the supporting struct to C for reuse */
  C->product->data = ptap;
  C->product->destroy = MatDestroy_MPIAIJ_PtAP;
  PetscFunctionReturn(0);
}

/* Numeric phase matching MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable():
   recomputes Rd/Ro/A_loc in place, forms C_loc = Rd*A_loc and C_oth = Ro*A_loc
   with the cached sequential kernels, and accumulates both into C. */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P, Mat A, Mat C) {
  Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *c_seq;
  Mat_APMPI *ptap;
  Mat A_loc, C_loc, C_oth;
  PetscInt i, rstart, rend, cm, ncols, row;
  const PetscInt *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->A_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");
  PetscCall(MatZeroEntries(C));

  /* These matrices are obtained in MatTransposeMatMultSymbolic() */
  /* 1) get R = Pd^T, Ro = Po^T */
  /*----------------------------*/
  PetscCall(MatTransposeSetPrecursor(p->A, ptap->Rd));
  PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
  PetscCall(MatTransposeSetPrecursor(p->B, ptap->Ro));
  PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));

  /* 2) compute numeric A_loc */
  /*--------------------------*/
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &ptap->A_loc));

  /* 3) C_loc = Rd*A_loc, C_oth = Ro*A_loc */
  /* dispatch through the cached sequential numeric kernels chosen at symbolic time */
  A_loc = ptap->A_loc;
  PetscCall(((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd, A_loc, ptap->C_loc));
  PetscCall(((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro, A_loc, ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row = rstart + i;
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  /* Co -> C, off-processor part: row i of C_oth is global row garray[i] */
  cm = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols = c_seq->j;
  vals = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row = p->garray[i];
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* Numeric C = P^T * A using the Mat_Merge_SeqsToMPI support struct built by
   the matching symbolic routine: accumulates P_loc^T*A_loc row-by-row via
   sparse outer products into the diagonal (ba) and off-processor (coa) parts,
   ships coa to the owning ranks, and merges everything into C. */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P, Mat A, Mat C) {
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ *pd = (Mat_SeqAIJ *)(p->A)->data, *po = (Mat_SeqAIJ *)(p->B)->data;
  Mat_APMPI *ap;
  PetscInt *adj;
  PetscInt i, j, k, anz, pnz, row, *cj, nexta;
  MatScalar *ada, *ca, valtmp;
  PetscInt am = A->rmap->n, cm = C->rmap->n, pon = (p->B)->cmap->n;
  MPI_Comm comm;
  PetscMPIInt size, rank, taga, *len_s;
  PetscInt *owners, proc, nrows, **buf_ri_k, **nextrow, **nextci;
  PetscInt **buf_ri, **buf_rj;
  PetscInt cnz = 0, *bj_i, *bi, *bj, bnz, nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request *s_waits, *r_waits;
  MPI_Status *status;
  MatScalar **abuf_r, *ba_i, *pA, *coa, *ba;
  const PetscScalar *dummy;
  PetscInt *ai, *aj, *coi, *coj, *poJ, *pdJ;
  Mat A_loc;
  Mat_SeqAIJ *a_loc;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ap = (Mat_APMPI *)C->product->data;
  PetscCheck(ap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be computed. Missing data");
  PetscCheck(ap->A_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  merge = ap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc */
  /*------------------------------------------*/
  /* get data from symbolic products */
  coi = merge->coi;
  coj = merge->coj;
  PetscCall(PetscCalloc1(coi[pon] + 1, &coa));
  bi = merge->bi;
  bj = merge->bj;
  owners = merge->rowmap->range;
  PetscCall(PetscCalloc1(bi[cm] + 1, &ba));

  /* get A_loc by taking all local rows of A */
  A_loc = ap->A_loc;
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));
  a_loc = (Mat_SeqAIJ *)(A_loc)->data;
  ai = a_loc->i;
  aj = a_loc->j;

  /* trigger copy to CPU */
  PetscCall(MatSeqAIJGetArrayRead(p->A, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(p->A, &dummy));
  PetscCall(MatSeqAIJGetArrayRead(p->B, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(p->B, &dummy));
  for (i = 0; i < am; i++) {
    anz = ai[i + 1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*-------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i + 1] - po->i[i];
    poJ = po->j + po->i[i];
    pA = po->a + po->i[i];
    for (j = 0; j < pnz; j++) {
      row = poJ[j];
      cj = coj + coi[row];
      ca = coa + coi[row];
      /* perform sparse axpy: the symbolic pattern cj is a superset of adj,
         so a single merged scan locates every target entry */
      nexta = 0;
      valtmp = pA[j];
      for (k = 0; nexta < anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp * ada[nexta];
          nexta++;
        }
      }
      PetscCall(PetscLogFlops(2.0 * anz));
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i + 1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA = pd->a + pd->i[i];
    for (j = 0; j < pnz; j++) {
      row = pdJ[j];
      cj = bj + bi[row];
      ca = ba + bi[row];
      /* perform sparse axpy */
      nexta = 0;
      valtmp = pA[j];
      for (k = 0; nexta < anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp * ada[nexta];
          nexta++;
        }
      }
      PetscCall(PetscLogFlops(2.0 * anz));
    }
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s = merge->len_s;
  PetscCall(PetscCommGetNewTag(comm, &taga));
  PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));

  PetscCall(PetscMalloc2(merge->nsend + 1, &s_waits, size, &status));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = merge->owners_co[proc];
    PetscCallMPI(MPI_Isend(coa + coi[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
    k++;
  }
  if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
  if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));

  PetscCall(PetscFree2(s_waits, status));
  PetscCall(PetscFree(r_waits));
  PetscCall(PetscFree(coa));

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextci));
  for (k = 0; k < merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows = *(buf_ri_k[k]);
    nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
    nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
  }

  for (i = 0; i < cm; i++) {
    row = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i];
    /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i + 1] - bi[i];
    /* add received vals into ba: sorted-merge of each message's row against bj_i */
    for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k] + 1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j = 0; nextcj < cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++;
        nextci[k]++;
        PetscCall(PetscLogFlops(2.0 * cnz));
      }
    }
    PetscCall(MatSetValues(C, 1, &row, bnz, bj_i, ba_i, INSERT_VALUES));
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  PetscCall(PetscFree(ba));
  PetscCall(PetscFree(abuf_r[0]));
  PetscCall(PetscFree(abuf_r));
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscFunctionReturn(0);
}

/*
   Symbolic phase of C = P^T * A for MPIAIJ matrices ("scalable" algorithm).

   Computes the nonzero pattern of C, splits it into a locally-owned part
   (bi/bj) and an off-process part Co = (p->B)^T*A (coi/coj) that is sent to
   the owning ranks, preallocates C, and stashes everything needed by the
   numeric phase in a Mat_APMPI/Mat_Merge_SeqsToMPI attached to C->product.
*/
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P, Mat A, PetscReal fill, Mat C) {
  Mat                  A_loc;
  Mat_APMPI           *ap;
  PetscFreeSpaceList   free_space = NULL, current_space = NULL;
  Mat_MPIAIJ          *p = (Mat_MPIAIJ *)P->data, *a = (Mat_MPIAIJ *)A->data;
  PetscInt            *pdti, *pdtj, *poti, *potj, *ptJ;
  PetscInt             nnz;
  PetscInt            *lnk, *owners_co, *coi, *coj, i, k, pnz, row;
  PetscInt             am = A->rmap->n, pn = P->cmap->n;
  MPI_Comm             comm;
  PetscMPIInt          size, rank, tagi, tagj, *len_si, *len_s, *len_ri;
  PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
  PetscInt             len, proc, *dnz, *onz, *owners;
  PetscInt             nzi, *bi, *bj;
  PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request         *swaits, *rwaits;
  MPI_Status          *sstatus, rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai, *aj, *Jptr, anz, *prmap = p->garray, pon, nspacedouble = 0, j;
  PetscReal            afill = 1.0, afill_tmp;
  PetscInt             rstart = P->cmap->rstart, rmax, aN = A->cmap->N, Armax;
  Mat_SeqAIJ          *a_loc;
  PetscTable           ta;
  MatType              mtype;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  /* check if matrix local sizes are compatible: P^T*A needs matching row layouts */
  PetscCheck(A->rmap->rstart == P->rmap->rstart && A->rmap->rend == P->rmap->rend, comm, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != P (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart,
             A->rmap->rend, P->rmap->rstart, P->rmap->rend);

  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* create struct Mat_APMPI and attach it to C later */
  PetscCall(PetscNew(&ap));

  /* get A_loc by taking all local rows of A */
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc));

  ap->A_loc = A_loc;
  a_loc     = (Mat_SeqAIJ *)(A_loc)->data;
  ai        = a_loc->i;
  aj        = a_loc->j;

  /* determine symbolic Co=(p->B)^T*A - send to others */
  /*----------------------------------------------------*/
  PetscCall(MatGetSymbolicTranspose_SeqAIJ(p->A, &pdti, &pdtj));
  PetscCall(MatGetSymbolicTranspose_SeqAIJ(p->B, &poti, &potj));
  pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors
                            >= (num of nonzero rows of C_seq) - pn */
  PetscCall(PetscMalloc1(pon + 1, &coi));
  coi[0] = 0;

  /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */
  nnz = PetscRealIntMultTruncate(fill, PetscIntSumTruncate(poti[pon], ai[am]));
  PetscCall(PetscFreeSpaceGet(nnz, &free_space));
  current_space = free_space;

  /* create and initialize a linked list; ta bounds the distinct column count (Armax)
     so the condensed linked list can be sized accordingly */
  PetscCall(PetscTableCreate(A->cmap->n + a->B->cmap->N, aN, &ta));
  MatRowMergeMax_SeqAIJ(a_loc, am, ta);
  PetscCall(PetscTableGetCount(ta, &Armax));

  PetscCall(PetscLLCondensedCreate_Scalable(Armax, &lnk));

  for (i = 0; i < pon; i++) {
    pnz = poti[i + 1] - poti[i];
    ptJ = potj + poti[i];
    for (j = 0; j < pnz; j++) {
      row  = ptJ[j]; /* row of A_loc == col of Pot */
      anz  = ai[row + 1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(anz, Jptr, lnk));
    }
    nnz = lnk[0];

    /* If free space is not available, double the total space in the list */
    if (current_space->local_remaining < nnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(nnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }

    /* Copy data into free space, and zero out denserows */
    PetscCall(PetscLLCondensedClean_Scalable(nnz, current_space->array, lnk));

    current_space->array += nnz;
    current_space->local_used += nnz;
    current_space->local_remaining -= nnz;

    coi[i + 1] = coi[i] + nnz;
  }

  PetscCall(PetscMalloc1(coi[pon] + 1, &coj));
  PetscCall(PetscFreeSpaceContiguous(&free_space, coj));
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); /* must destroy to get a new one for C */

  afill_tmp = (PetscReal)coi[pon] / (poti[pon] + ai[am] + 1);
  if (afill_tmp > afill) afill = afill_tmp;

  /* send j-array (coj) of Co to other processors */
  /*----------------------------------------------*/
  /* determine row ownership */
  PetscCall(PetscNew(&merge));
  PetscCall(PetscLayoutCreate(comm, &merge->rowmap));

  merge->rowmap->n  = pn;
  merge->rowmap->bs = 1;

  PetscCall(PetscLayoutSetUp(merge->rowmap));
  owners = merge->rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscCalloc1(size, &len_si));
  PetscCall(PetscCalloc1(size, &merge->len_s));

  len_s        = merge->len_s;
  merge->nsend = 0;

  PetscCall(PetscMalloc1(size + 2, &owners_co));

  /* prmap (garray) is sorted, so the owner search can sweep proc forward */
  proc = 0;
  for (i = 0; i < pon; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++; /* num of rows in Co to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i];
  }

  len          = 0; /* max length of buf_si[] */
  owners_co[0] = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_si[proc]) {
      merge->nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* i-structure: nrows + row indices + (nrows+1) offsets */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(merge->nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* receives and sends of coj are complete */
  PetscCall(PetscMalloc1(size, &sstatus));
  for (i = 0; i < merge->nrecv; i++) {
    PETSC_UNUSED PetscMPIInt icompleted;
    PetscCallMPI(MPI_Waitany(merge->nrecv, rwaits, &icompleted, &rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, swaits, sstatus));

  /* add received column indices into table to update Armax */
  /* Armax can be as large as aN if a P[row,:] is dense, see src/ksp/ksp/tutorials/ex56.c! */
  for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < merge->len_r[k]; j++) { PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES)); }
  }
  PetscCall(PetscTableGetCount(ta, &Armax));

  /* send and recv coi */
  /*-------------------*/
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:           nrows to be sent
         [1:nrows]:           row index (global)
         [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc] / 2 - 1;
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi;  /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  i = merge->nrecv;
  while (i--) {
    PETSC_UNUSED PetscMPIInt icompleted;
    PetscCallMPI(MPI_Waitany(merge->nrecv, rwaits, &icompleted, &rstatus));
  }
  PetscCall(PetscFree(rwaits));
  if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, swaits, sstatus));
  PetscCall(PetscFree(len_si));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(sstatus));
  PetscCall(PetscFree(buf_s));

  /* compute the local portion of C (mpi mat) */
  /*------------------------------------------*/
  /* allocate bi array and free space for accumulating nonzero column info */
  PetscCall(PetscMalloc1(pn + 1, &bi));
  bi[0] = 0;

  /* set initial free space to be fill*(nnz(P) + nnz(AP)) */
  nnz = PetscRealIntMultTruncate(fill, PetscIntSumTruncate(pdti[pn], PetscIntSumTruncate(poti[pon], ai[am])));
  PetscCall(PetscFreeSpaceGet(nnz, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextci));
  for (k = 0; k < merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1;           /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
  }

  PetscCall(PetscLLCondensedCreate_Scalable(Armax, &lnk));
  MatPreallocateBegin(comm, pn, A->cmap->n, dnz, onz);
  rmax = 0;
  for (i = 0; i < pn; i++) {
    /* add pdt[i,:]*AP into lnk */
    pnz = pdti[i + 1] - pdti[i];
    ptJ = pdtj + pdti[i];
    for (j = 0; j < pnz; j++) {
      row  = ptJ[j]; /* row of AP == col of Pt */
      anz  = ai[row + 1] - ai[row];
      Jptr = aj + ai[row];
      /* add non-zero cols of AP into the sorted linked list lnk */
      PetscCall(PetscLLCondensedAddSorted_Scalable(anz, Jptr, lnk));
    }

    /* add received col data into lnk */
    for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) {            /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk));
        nextrow[k]++;
        nextci[k]++;
      }
    }

    /* add missing diagonal entry */
    if (C->force_diagonals) {
      k = i + owners[rank]; /* column index */
      PetscCall(PetscLLCondensedAddSorted_Scalable(1, &k, lnk));
    }

    nnz = lnk[0];

    /* if free space is not available, make more free space */
    if (current_space->local_remaining < nnz) {
      PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(nnz, current_space->total_array_size), &current_space));
      nspacedouble++;
    }
    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean_Scalable(nnz, current_space->array, lnk));
    PetscCall(MatPreallocateSet(i + owners[rank], nnz, current_space->array, dnz, onz));

    current_space->array += nnz;
    current_space->local_used += nnz;
    current_space->local_remaining -= nnz;

    bi[i + 1] = bi[i] + nnz;
    if (nnz > rmax) rmax = nnz;
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));

  PetscCall(PetscMalloc1(bi[pn] + 1, &bj));
  PetscCall(PetscFreeSpaceContiguous(&free_space, bj));
  afill_tmp = (PetscReal)bi[pn] / (pdti[pn] + poti[pon] + ai[am] + 1);
  if (afill_tmp > afill) afill = afill_tmp;
  PetscCall(PetscLLCondensedDestroy_Scalable(lnk));
  PetscCall(PetscTableDestroy(&ta));
  PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(p->A, &pdti, &pdtj));
  PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(p->B, &poti, &potj));

  /* create symbolic parallel matrix C - why cannot be assembled in Numeric part */
  /*-------------------------------------------------------------------------------*/
  PetscCall(MatSetSizes(C, pn, A->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetBlockSizes(C, PetscAbs(P->cmap->bs), PetscAbs(A->cmap->bs)));
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(C, mtype));
  PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);
  PetscCall(MatSetBlockSize(C, 1));
  PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE));
  /* insert the pattern (values NULL) so the nonzero structure is fixed for the numeric phase */
  for (i = 0; i < pn; i++) {
    row  = i + rstart;
    nnz  = bi[i + 1] - bi[i];
    Jptr = bj + bi[i];
    PetscCall(MatSetValues(C, 1, &row, nnz, Jptr, NULL, INSERT_VALUES));
  }
2036 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 2037 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 2038 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 2039 merge->bi = bi; 2040 merge->bj = bj; 2041 merge->coi = coi; 2042 merge->coj = coj; 2043 merge->buf_ri = buf_ri; 2044 merge->buf_rj = buf_rj; 2045 merge->owners_co = owners_co; 2046 2047 /* attach the supporting struct to C for reuse */ 2048 C->product->data = ap; 2049 C->product->destroy = MatDestroy_MPIAIJ_PtAP; 2050 ap->merge = merge; 2051 2052 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ; 2053 2054 #if defined(PETSC_USE_INFO) 2055 if (bi[pn] != 0) { 2056 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill)); 2057 PetscCall(PetscInfo(C, "Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n", (double)afill)); 2058 } else { 2059 PetscCall(PetscInfo(C, "Empty matrix product\n")); 2060 } 2061 #endif 2062 PetscFunctionReturn(0); 2063 } 2064 2065 /* ---------------------------------------------------------------- */ 2066 static PetscErrorCode MatProductSymbolic_AtB_MPIAIJ_MPIAIJ(Mat C) { 2067 Mat_Product *product = C->product; 2068 Mat A = product->A, B = product->B; 2069 PetscReal fill = product->fill; 2070 PetscBool flg; 2071 2072 PetscFunctionBegin; 2073 /* scalable */ 2074 PetscCall(PetscStrcmp(product->alg, "scalable", &flg)); 2075 if (flg) { 2076 PetscCall(MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(A, B, fill, C)); 2077 goto next; 2078 } 2079 2080 /* nonscalable */ 2081 PetscCall(PetscStrcmp(product->alg, "nonscalable", &flg)); 2082 if (flg) { 2083 PetscCall(MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A, B, fill, C)); 2084 goto next; 2085 } 2086 2087 /* matmatmult */ 2088 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 2089 if (flg) { 2090 Mat At; 2091 Mat_APMPI *ptap; 2092 2093 PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 2094 
PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ(At, B, fill, C)); 2095 ptap = (Mat_APMPI *)C->product->data; 2096 if (ptap) { 2097 ptap->Pt = At; 2098 C->product->destroy = MatDestroy_MPIAIJ_PtAP; 2099 } 2100 C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult; 2101 goto next; 2102 } 2103 2104 /* backend general code */ 2105 PetscCall(PetscStrcmp(product->alg, "backend", &flg)); 2106 if (flg) { 2107 PetscCall(MatProductSymbolic_MPIAIJBACKEND(C)); 2108 PetscFunctionReturn(0); 2109 } 2110 2111 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type is not supported"); 2112 2113 next: 2114 C->ops->productnumeric = MatProductNumeric_AtB; 2115 PetscFunctionReturn(0); 2116 } 2117 2118 /* ---------------------------------------------------------------- */ 2119 /* Set options for MatMatMultxxx_MPIAIJ_MPIAIJ */ 2120 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_AB(Mat C) { 2121 Mat_Product *product = C->product; 2122 Mat A = product->A, B = product->B; 2123 #if defined(PETSC_HAVE_HYPRE) 2124 const char *algTypes[5] = {"scalable", "nonscalable", "seqmpi", "backend", "hypre"}; 2125 PetscInt nalg = 5; 2126 #else 2127 const char *algTypes[4] = { 2128 "scalable", 2129 "nonscalable", 2130 "seqmpi", 2131 "backend", 2132 }; 2133 PetscInt nalg = 4; 2134 #endif 2135 PetscInt alg = 1; /* set nonscalable algorithm as default */ 2136 PetscBool flg; 2137 MPI_Comm comm; 2138 2139 PetscFunctionBegin; 2140 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2141 2142 /* Set "nonscalable" as default algorithm */ 2143 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2144 if (flg) { 2145 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2146 2147 /* Set "scalable" as default if BN and local nonzeros of A and B are large */ 2148 if (B->cmap->N > 100000) { /* may switch to scalable algorithm as default */ 2149 MatInfo Ainfo, Binfo; 2150 PetscInt nz_local; 2151 PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable; 2152 
2153 PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo)); 2154 PetscCall(MatGetInfo(B, MAT_LOCAL, &Binfo)); 2155 nz_local = (PetscInt)(Ainfo.nz_allocated + Binfo.nz_allocated); 2156 2157 if (B->cmap->N > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE; 2158 PetscCall(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPIU_BOOL, MPI_LOR, comm)); 2159 2160 if (alg_scalable) { 2161 alg = 0; /* scalable algorithm would 50% slower than nonscalable algorithm */ 2162 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2163 PetscCall(PetscInfo(B, "Use scalable algorithm, BN %" PetscInt_FMT ", fill*nz_allocated %g\n", B->cmap->N, (double)(product->fill * nz_local))); 2164 } 2165 } 2166 } 2167 2168 /* Get runtime option */ 2169 if (product->api_user) { 2170 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2171 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2172 PetscOptionsEnd(); 2173 } else { 2174 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2175 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2176 PetscOptionsEnd(); 2177 } 2178 if (flg) { PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); } 2179 2180 C->ops->productsymbolic = MatProductSymbolic_AB_MPIAIJ_MPIAIJ; 2181 PetscFunctionReturn(0); 2182 } 2183 2184 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_ABt(Mat C) { 2185 PetscFunctionBegin; 2186 PetscCall(MatProductSetFromOptions_MPIAIJ_AB(C)); 2187 C->ops->productsymbolic = MatProductSymbolic_ABt_MPIAIJ_MPIAIJ; 2188 PetscFunctionReturn(0); 2189 } 2190 2191 /* Set options for MatTransposeMatMultXXX_MPIAIJ_MPIAIJ */ 2192 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_AtB(Mat C) { 2193 Mat_Product *product = C->product; 2194 Mat A = 
product->A, B = product->B; 2195 const char *algTypes[4] = {"scalable", "nonscalable", "at*b", "backend"}; 2196 PetscInt nalg = 4; 2197 PetscInt alg = 1; /* set default algorithm */ 2198 PetscBool flg; 2199 MPI_Comm comm; 2200 2201 PetscFunctionBegin; 2202 /* Check matrix local sizes */ 2203 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2204 PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ",%" PetscInt_FMT ")", 2205 A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend); 2206 2207 /* Set default algorithm */ 2208 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2209 if (flg) { PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); } 2210 2211 /* Set "scalable" as default if BN and local nonzeros of A and B are large */ 2212 if (alg && B->cmap->N > 100000) { /* may switch to scalable algorithm as default */ 2213 MatInfo Ainfo, Binfo; 2214 PetscInt nz_local; 2215 PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable; 2216 2217 PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo)); 2218 PetscCall(MatGetInfo(B, MAT_LOCAL, &Binfo)); 2219 nz_local = (PetscInt)(Ainfo.nz_allocated + Binfo.nz_allocated); 2220 2221 if (B->cmap->N > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE; 2222 PetscCall(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPIU_BOOL, MPI_LOR, comm)); 2223 2224 if (alg_scalable) { 2225 alg = 0; /* scalable algorithm would 50% slower than nonscalable algorithm */ 2226 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2227 PetscCall(PetscInfo(B, "Use scalable algorithm, BN %" PetscInt_FMT ", fill*nz_allocated %g\n", B->cmap->N, (double)(product->fill * nz_local))); 2228 } 2229 } 2230 2231 /* Get runtime option */ 2232 if (product->api_user) { 2233 PetscOptionsBegin(PetscObjectComm((PetscObject)C), 
((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2234 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2235 PetscOptionsEnd(); 2236 } else { 2237 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2238 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2239 PetscOptionsEnd(); 2240 } 2241 if (flg) { PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); } 2242 2243 C->ops->productsymbolic = MatProductSymbolic_AtB_MPIAIJ_MPIAIJ; 2244 PetscFunctionReturn(0); 2245 } 2246 2247 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_PtAP(Mat C) { 2248 Mat_Product *product = C->product; 2249 Mat A = product->A, P = product->B; 2250 MPI_Comm comm; 2251 PetscBool flg; 2252 PetscInt alg = 1; /* set default algorithm */ 2253 #if !defined(PETSC_HAVE_HYPRE) 2254 const char *algTypes[5] = {"scalable", "nonscalable", "allatonce", "allatonce_merged", "backend"}; 2255 PetscInt nalg = 5; 2256 #else 2257 const char *algTypes[6] = {"scalable", "nonscalable", "allatonce", "allatonce_merged", "backend", "hypre"}; 2258 PetscInt nalg = 6; 2259 #endif 2260 PetscInt pN = P->cmap->N; 2261 2262 PetscFunctionBegin; 2263 /* Check matrix local sizes */ 2264 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 2265 PetscCheck(A->rmap->rstart == P->rmap->rstart && A->rmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Arow (%" PetscInt_FMT ", %" PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")", 2266 A->rmap->rstart, A->rmap->rend, P->rmap->rstart, P->rmap->rend); 2267 PetscCheck(A->cmap->rstart == P->rmap->rstart && A->cmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Acol (%" PetscInt_FMT ", %" 
PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")", 2268 A->cmap->rstart, A->cmap->rend, P->rmap->rstart, P->rmap->rend); 2269 2270 /* Set "nonscalable" as default algorithm */ 2271 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2272 if (flg) { 2273 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2274 2275 /* Set "scalable" as default if BN and local nonzeros of A and B are large */ 2276 if (pN > 100000) { 2277 MatInfo Ainfo, Pinfo; 2278 PetscInt nz_local; 2279 PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable; 2280 2281 PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo)); 2282 PetscCall(MatGetInfo(P, MAT_LOCAL, &Pinfo)); 2283 nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated); 2284 2285 if (pN > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE; 2286 PetscCall(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPIU_BOOL, MPI_LOR, comm)); 2287 2288 if (alg_scalable) { 2289 alg = 0; /* scalable algorithm would 50% slower than nonscalable algorithm */ 2290 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2291 } 2292 } 2293 } 2294 2295 /* Get runtime option */ 2296 if (product->api_user) { 2297 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2298 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg)); 2299 PetscOptionsEnd(); 2300 } else { 2301 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2302 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg)); 2303 PetscOptionsEnd(); 2304 } 2305 if (flg) { PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); } 2306 2307 C->ops->productsymbolic = MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ; 2308 PetscFunctionReturn(0); 2309 } 2310 2311 static PetscErrorCode 
MatProductSetFromOptions_MPIAIJ_RARt(Mat C) { 2312 Mat_Product *product = C->product; 2313 Mat A = product->A, R = product->B; 2314 2315 PetscFunctionBegin; 2316 /* Check matrix local sizes */ 2317 PetscCheck(A->cmap->n == R->cmap->n && A->rmap->n == R->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A local (%" PetscInt_FMT ", %" PetscInt_FMT "), R local (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->n, 2318 A->rmap->n, R->rmap->n, R->cmap->n); 2319 2320 C->ops->productsymbolic = MatProductSymbolic_RARt_MPIAIJ_MPIAIJ; 2321 PetscFunctionReturn(0); 2322 } 2323 2324 /* 2325 Set options for ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm 2326 */ 2327 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_ABC(Mat C) { 2328 Mat_Product *product = C->product; 2329 PetscBool flg = PETSC_FALSE; 2330 PetscInt alg = 1; /* default algorithm */ 2331 const char *algTypes[3] = {"scalable", "nonscalable", "seqmpi"}; 2332 PetscInt nalg = 3; 2333 2334 PetscFunctionBegin; 2335 /* Set default algorithm */ 2336 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2337 if (flg) { PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); } 2338 2339 /* Get runtime option */ 2340 if (product->api_user) { 2341 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2342 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2343 PetscOptionsEnd(); 2344 } else { 2345 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2346 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2347 PetscOptionsEnd(); 2348 } 2349 if (flg) { PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); } 2350 2351 C->ops->matmatmultsymbolic = 
MatMatMatMultSymbolic_MPIAIJ_MPIAIJ_MPIAIJ; 2352 C->ops->productsymbolic = MatProductSymbolic_ABC; 2353 PetscFunctionReturn(0); 2354 } 2355 2356 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJ(Mat C) { 2357 Mat_Product *product = C->product; 2358 2359 PetscFunctionBegin; 2360 switch (product->type) { 2361 case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_MPIAIJ_AB(C)); break; 2362 case MATPRODUCT_ABt: PetscCall(MatProductSetFromOptions_MPIAIJ_ABt(C)); break; 2363 case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_MPIAIJ_AtB(C)); break; 2364 case MATPRODUCT_PtAP: PetscCall(MatProductSetFromOptions_MPIAIJ_PtAP(C)); break; 2365 case MATPRODUCT_RARt: PetscCall(MatProductSetFromOptions_MPIAIJ_RARt(C)); break; 2366 case MATPRODUCT_ABC: PetscCall(MatProductSetFromOptions_MPIAIJ_ABC(C)); break; 2367 default: break; 2368 } 2369 PetscFunctionReturn(0); 2370 } 2371