1 2 /* 3 Defines matrix-matrix product routines for pairs of MPIAIJ matrices 4 C = A * B 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <../src/mat/impls/aij/mpi/mpiaij.h> 9 #include <petscbt.h> 10 #include <../src/mat/impls/dense/mpi/mpidense.h> 11 #include <petsc/private/vecimpl.h> 12 #include <petsc/private/sfimpl.h> 13 14 #if defined(PETSC_HAVE_HYPRE) 15 PETSC_INTERN PetscErrorCode MatMatMultSymbolic_AIJ_AIJ_wHYPRE(Mat, Mat, PetscReal, Mat); 16 #endif 17 18 PETSC_INTERN PetscErrorCode MatProductSymbolic_ABt_MPIAIJ_MPIAIJ(Mat C) { 19 Mat_Product *product = C->product; 20 Mat B = product->B; 21 22 PetscFunctionBegin; 23 PetscCall(MatTranspose(B, MAT_INITIAL_MATRIX, &product->B)); 24 PetscCall(MatDestroy(&B)); 25 PetscCall(MatProductSymbolic_AB_MPIAIJ_MPIAIJ(C)); 26 PetscFunctionReturn(0); 27 } 28 29 PETSC_INTERN PetscErrorCode MatProductSymbolic_AB_MPIAIJ_MPIAIJ(Mat C) { 30 Mat_Product *product = C->product; 31 Mat A = product->A, B = product->B; 32 MatProductAlgorithm alg = product->alg; 33 PetscReal fill = product->fill; 34 PetscBool flg; 35 36 PetscFunctionBegin; 37 /* scalable */ 38 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 39 if (flg) { 40 PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ(A, B, fill, C)); 41 PetscFunctionReturn(0); 42 } 43 44 /* nonscalable */ 45 PetscCall(PetscStrcmp(alg, "nonscalable", &flg)); 46 if (flg) { 47 PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A, B, fill, C)); 48 PetscFunctionReturn(0); 49 } 50 51 /* seqmpi */ 52 PetscCall(PetscStrcmp(alg, "seqmpi", &flg)); 53 if (flg) { 54 PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(A, B, fill, C)); 55 PetscFunctionReturn(0); 56 } 57 58 /* backend general code */ 59 PetscCall(PetscStrcmp(alg, "backend", &flg)); 60 if (flg) { 61 PetscCall(MatProductSymbolic_MPIAIJBACKEND(C)); 62 PetscFunctionReturn(0); 63 } 64 65 #if defined(PETSC_HAVE_HYPRE) 66 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 67 if (flg) { 68 
PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 69 PetscFunctionReturn(0); 70 } 71 #endif 72 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 73 } 74 75 PetscErrorCode MatDestroy_MPIAIJ_MatMatMult(void *data) { 76 Mat_APMPI *ptap = (Mat_APMPI *)data; 77 78 PetscFunctionBegin; 79 PetscCall(PetscFree2(ptap->startsj_s, ptap->startsj_r)); 80 PetscCall(PetscFree(ptap->bufa)); 81 PetscCall(MatDestroy(&ptap->P_loc)); 82 PetscCall(MatDestroy(&ptap->P_oth)); 83 PetscCall(MatDestroy(&ptap->Pt)); 84 PetscCall(PetscFree(ptap->api)); 85 PetscCall(PetscFree(ptap->apj)); 86 PetscCall(PetscFree(ptap->apa)); 87 PetscCall(PetscFree(ptap)); 88 PetscFunctionReturn(0); 89 } 90 91 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat A, Mat P, Mat C) { 92 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *c = (Mat_MPIAIJ *)C->data; 93 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 94 Mat_SeqAIJ *cd = (Mat_SeqAIJ *)(c->A)->data, *co = (Mat_SeqAIJ *)(c->B)->data; 95 PetscScalar *cda = cd->a, *coa = co->a; 96 Mat_SeqAIJ *p_loc, *p_oth; 97 PetscScalar *apa, *ca; 98 PetscInt cm = C->rmap->n; 99 Mat_APMPI *ptap; 100 PetscInt *api, *apj, *apJ, i, k; 101 PetscInt cstart = C->cmap->rstart; 102 PetscInt cdnz, conz, k0, k1; 103 const PetscScalar *dummy; 104 MPI_Comm comm; 105 PetscMPIInt size; 106 107 PetscFunctionBegin; 108 MatCheckProduct(C, 3); 109 ptap = (Mat_APMPI *)C->product->data; 110 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 111 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 112 PetscCallMPI(MPI_Comm_size(comm, &size)); 113 PetscCheck(ptap->P_oth || size <= 1, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "AP cannot be reused. 
Do not call MatProductClear()"); 114 115 /* flag CPU mask for C */ 116 #if defined(PETSC_HAVE_DEVICE) 117 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 118 if (c->A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->A->offloadmask = PETSC_OFFLOAD_CPU; 119 if (c->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->B->offloadmask = PETSC_OFFLOAD_CPU; 120 #endif 121 122 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 123 /*-----------------------------------------------------*/ 124 /* update numerical values of P_oth and P_loc */ 125 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 126 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 127 128 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 129 /*----------------------------------------------------------*/ 130 /* get data from symbolic products */ 131 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 132 p_oth = NULL; 133 if (size > 1) p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 134 135 /* get apa for storing dense row A[i,:]*P */ 136 apa = ptap->apa; 137 138 api = ptap->api; 139 apj = ptap->apj; 140 /* trigger copy to CPU */ 141 PetscCall(MatSeqAIJGetArrayRead(a->A, &dummy)); 142 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &dummy)); 143 PetscCall(MatSeqAIJGetArrayRead(a->B, &dummy)); 144 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &dummy)); 145 for (i = 0; i < cm; i++) { 146 /* compute apa = A[i,:]*P */ 147 AProw_nonscalable(i, ad, ao, p_loc, p_oth, apa); 148 149 /* set values in C */ 150 apJ = apj + api[i]; 151 cdnz = cd->i[i + 1] - cd->i[i]; 152 conz = co->i[i + 1] - co->i[i]; 153 154 /* 1st off-diagonal part of C */ 155 ca = coa + co->i[i]; 156 k = 0; 157 for (k0 = 0; k0 < conz; k0++) { 158 if (apJ[k] >= cstart) break; 159 ca[k0] = apa[apJ[k]]; 160 apa[apJ[k++]] = 0.0; 161 } 162 163 /* diagonal part of C */ 164 ca = cda + cd->i[i]; 165 for (k1 = 0; k1 < cdnz; k1++) { 166 ca[k1] = 
apa[apJ[k]]; 167 apa[apJ[k++]] = 0.0; 168 } 169 170 /* 2nd off-diagonal part of C */ 171 ca = coa + co->i[i]; 172 for (; k0 < conz; k0++) { 173 ca[k0] = apa[apJ[k]]; 174 apa[apJ[k++]] = 0.0; 175 } 176 } 177 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 178 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 179 PetscFunctionReturn(0); 180 } 181 182 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat A, Mat P, PetscReal fill, Mat C) { 183 MPI_Comm comm; 184 PetscMPIInt size; 185 Mat_APMPI *ptap; 186 PetscFreeSpaceList free_space = NULL, current_space = NULL; 187 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 188 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_loc, *p_oth; 189 PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *dnz, *onz; 190 PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, *aoj = ao->j, rstart = A->rmap->rstart; 191 PetscInt *lnk, i, pnz, row, *api, *apj, *Jptr, apnz, nspacedouble = 0, j, nzi; 192 PetscInt am = A->rmap->n, pN = P->cmap->N, pn = P->cmap->n, pm = P->rmap->n; 193 PetscBT lnkbt; 194 PetscReal afill; 195 MatType mtype; 196 197 PetscFunctionBegin; 198 MatCheckProduct(C, 4); 199 PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 200 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 201 PetscCallMPI(MPI_Comm_size(comm, &size)); 202 203 /* create struct Mat_APMPI and attached it to C later */ 204 PetscCall(PetscNew(&ptap)); 205 206 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 207 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 208 209 /* get P_loc by taking all local rows of P */ 210 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 211 212 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 213 pi_loc = p_loc->i; 214 pj_loc = p_loc->j; 215 if (size > 1) { 216 p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 217 pi_oth = 
p_oth->i; 218 pj_oth = p_oth->j; 219 } else { 220 p_oth = NULL; 221 pi_oth = NULL; 222 pj_oth = NULL; 223 } 224 225 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 226 /*-------------------------------------------------------------------*/ 227 PetscCall(PetscMalloc1(am + 2, &api)); 228 ptap->api = api; 229 api[0] = 0; 230 231 /* create and initialize a linked list */ 232 PetscCall(PetscLLCondensedCreate(pN, pN, &lnk, &lnkbt)); 233 234 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 235 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space)); 236 current_space = free_space; 237 238 MatPreallocateBegin(comm, am, pn, dnz, onz); 239 for (i = 0; i < am; i++) { 240 /* diagonal portion of A */ 241 nzi = adi[i + 1] - adi[i]; 242 for (j = 0; j < nzi; j++) { 243 row = *adj++; 244 pnz = pi_loc[row + 1] - pi_loc[row]; 245 Jptr = pj_loc + pi_loc[row]; 246 /* add non-zero cols of P into the sorted linked list lnk */ 247 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 248 } 249 /* off-diagonal portion of A */ 250 nzi = aoi[i + 1] - aoi[i]; 251 for (j = 0; j < nzi; j++) { 252 row = *aoj++; 253 pnz = pi_oth[row + 1] - pi_oth[row]; 254 Jptr = pj_oth + pi_oth[row]; 255 PetscCall(PetscLLCondensedAddSorted(pnz, Jptr, lnk, lnkbt)); 256 } 257 /* add possible missing diagonal entry */ 258 if (C->force_diagonals) { 259 j = i + rstart; /* column index */ 260 PetscCall(PetscLLCondensedAddSorted(1, &j, lnk, lnkbt)); 261 } 262 263 apnz = lnk[0]; 264 api[i + 1] = api[i] + apnz; 265 266 /* if free space is not available, double the total space in the list */ 267 if (current_space->local_remaining < apnz) { 268 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 269 nspacedouble++; 270 } 271 272 /* Copy data into free space, then initialize lnk */ 273 PetscCall(PetscLLCondensedClean(pN, apnz, current_space->array, lnk, 
lnkbt)); 274 PetscCall(MatPreallocateSet(i + rstart, apnz, current_space->array, dnz, onz)); 275 276 current_space->array += apnz; 277 current_space->local_used += apnz; 278 current_space->local_remaining -= apnz; 279 } 280 281 /* Allocate space for apj, initialize apj, and */ 282 /* destroy list of free space and other temporary array(s) */ 283 PetscCall(PetscMalloc1(api[am] + 1, &ptap->apj)); 284 apj = ptap->apj; 285 PetscCall(PetscFreeSpaceContiguous(&free_space, ptap->apj)); 286 PetscCall(PetscLLDestroy(lnk, lnkbt)); 287 288 /* malloc apa to store dense row A[i,:]*P */ 289 PetscCall(PetscCalloc1(pN, &ptap->apa)); 290 291 /* set and assemble symbolic parallel matrix C */ 292 /*---------------------------------------------*/ 293 PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 294 PetscCall(MatSetBlockSizesFromMats(C, A, P)); 295 296 PetscCall(MatGetType(A, &mtype)); 297 PetscCall(MatSetType(C, mtype)); 298 PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz)); 299 MatPreallocateEnd(dnz, onz); 300 301 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api)); 302 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 303 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 304 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 305 306 C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable; 307 C->ops->productnumeric = MatProductNumeric_AB; 308 309 /* attach the supporting struct to C for reuse */ 310 C->product->data = ptap; 311 C->product->destroy = MatDestroy_MPIAIJ_MatMatMult; 312 313 /* set MatInfo */ 314 afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5; 315 if (afill < 1.0) afill = 1.0; 316 C->info.mallocs = nspacedouble; 317 C->info.fill_ratio_given = fill; 318 C->info.fill_ratio_needed = afill; 319 320 #if defined(PETSC_USE_INFO) 321 if (api[am]) { 322 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, 
(double)afill)); 323 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 324 } else { 325 PetscCall(PetscInfo(C, "Empty matrix product\n")); 326 } 327 #endif 328 PetscFunctionReturn(0); 329 } 330 331 /* ------------------------------------------------------- */ 332 static PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat, Mat, PetscReal, Mat); 333 static PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat, Mat, Mat); 334 335 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense_AB(Mat C) { 336 Mat_Product *product = C->product; 337 Mat A = product->A, B = product->B; 338 339 PetscFunctionBegin; 340 if (A->cmap->rstart != B->rmap->rstart || A->cmap->rend != B->rmap->rend) 341 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->cmap->rstart, A->cmap->rend, B->rmap->rstart, B->rmap->rend); 342 343 C->ops->matmultsymbolic = MatMatMultSymbolic_MPIAIJ_MPIDense; 344 C->ops->productsymbolic = MatProductSymbolic_AB; 345 PetscFunctionReturn(0); 346 } 347 /* -------------------------------------------------------------------- */ 348 static PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense_AtB(Mat C) { 349 Mat_Product *product = C->product; 350 Mat A = product->A, B = product->B; 351 352 PetscFunctionBegin; 353 if (A->rmap->rstart != B->rmap->rstart || A->rmap->rend != B->rmap->rend) 354 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, (%" PetscInt_FMT ", %" PetscInt_FMT ") != (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend); 355 356 C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_MPIAIJ_MPIDense; 357 C->ops->productsymbolic = MatProductSymbolic_AtB; 358 PetscFunctionReturn(0); 359 } 360 361 /* --------------------------------------------------------------------- */ 362 PETSC_INTERN 
PetscErrorCode MatProductSetFromOptions_MPIAIJ_MPIDense(Mat C) { 363 Mat_Product *product = C->product; 364 365 PetscFunctionBegin; 366 switch (product->type) { 367 case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_MPIAIJ_MPIDense_AB(C)); break; 368 case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_MPIAIJ_MPIDense_AtB(C)); break; 369 default: break; 370 } 371 PetscFunctionReturn(0); 372 } 373 /* ------------------------------------------------------- */ 374 375 typedef struct { 376 Mat workB, workB1; 377 MPI_Request *rwaits, *swaits; 378 PetscInt nsends, nrecvs; 379 MPI_Datatype *stype, *rtype; 380 PetscInt blda; 381 } MPIAIJ_MPIDense; 382 383 PetscErrorCode MatMPIAIJ_MPIDenseDestroy(void *ctx) { 384 MPIAIJ_MPIDense *contents = (MPIAIJ_MPIDense *)ctx; 385 PetscInt i; 386 387 PetscFunctionBegin; 388 PetscCall(MatDestroy(&contents->workB)); 389 PetscCall(MatDestroy(&contents->workB1)); 390 for (i = 0; i < contents->nsends; i++) PetscCallMPI(MPI_Type_free(&contents->stype[i])); 391 for (i = 0; i < contents->nrecvs; i++) PetscCallMPI(MPI_Type_free(&contents->rtype[i])); 392 PetscCall(PetscFree4(contents->stype, contents->rtype, contents->rwaits, contents->swaits)); 393 PetscCall(PetscFree(contents)); 394 PetscFunctionReturn(0); 395 } 396 397 static PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIDense(Mat A, Mat B, PetscReal fill, Mat C) { 398 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 399 PetscInt nz = aij->B->cmap->n, nsends, nrecvs, i, nrows_to, j, blda, m, M, n, N; 400 MPIAIJ_MPIDense *contents; 401 VecScatter ctx = aij->Mvctx; 402 PetscInt Am = A->rmap->n, Bm = B->rmap->n, BN = B->cmap->N, Bbn, Bbn1, bs, nrows_from, numBb; 403 MPI_Comm comm; 404 MPI_Datatype type1, *stype, *rtype; 405 const PetscInt *sindices, *sstarts, *rstarts; 406 PetscMPIInt *disp; 407 PetscBool cisdense; 408 409 PetscFunctionBegin; 410 MatCheckProduct(C, 4); 411 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data not empty"); 412 
PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 413 PetscCall(PetscObjectBaseTypeCompare((PetscObject)C, MATMPIDENSE, &cisdense)); 414 if (!cisdense) PetscCall(MatSetType(C, ((PetscObject)B)->type_name)); 415 PetscCall(MatGetLocalSize(C, &m, &n)); 416 PetscCall(MatGetSize(C, &M, &N)); 417 if (m == PETSC_DECIDE || n == PETSC_DECIDE || M == PETSC_DECIDE || N == PETSC_DECIDE) PetscCall(MatSetSizes(C, Am, B->cmap->n, A->rmap->N, BN)); 418 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 419 PetscCall(MatSetUp(C)); 420 PetscCall(MatDenseGetLDA(B, &blda)); 421 PetscCall(PetscNew(&contents)); 422 423 PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, NULL, NULL)); 424 PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, NULL, NULL)); 425 426 /* Create column block of B and C for memory scalability when BN is too large */ 427 /* Estimate Bbn, column size of Bb */ 428 if (nz) { 429 Bbn1 = 2 * Am * BN / nz; 430 if (!Bbn1) Bbn1 = 1; 431 } else Bbn1 = BN; 432 433 bs = PetscAbs(B->cmap->bs); 434 Bbn1 = Bbn1 / bs * bs; /* Bbn1 is a multiple of bs */ 435 if (Bbn1 > BN) Bbn1 = BN; 436 PetscCallMPI(MPI_Allreduce(&Bbn1, &Bbn, 1, MPIU_INT, MPI_MAX, comm)); 437 438 /* Enable runtime option for Bbn */ 439 PetscOptionsBegin(comm, ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 440 PetscCall(PetscOptionsInt("-matmatmult_Bbn", "Number of columns in Bb", "MatMatMult", Bbn, &Bbn, NULL)); 441 PetscOptionsEnd(); 442 Bbn = PetscMin(Bbn, BN); 443 444 if (Bbn > 0 && Bbn < BN) { 445 numBb = BN / Bbn; 446 Bbn1 = BN - numBb * Bbn; 447 } else numBb = 0; 448 449 if (numBb) { 450 PetscCall(PetscInfo(C, "use Bb, BN=%" PetscInt_FMT ", Bbn=%" PetscInt_FMT "; numBb=%" PetscInt_FMT "\n", BN, Bbn, numBb)); 451 if (Bbn1) { /* Create workB1 for the remaining columns */ 452 PetscCall(PetscInfo(C, "use Bb1, BN=%" PetscInt_FMT ", Bbn1=%" PetscInt_FMT "\n", BN, Bbn1)); 453 /* Create work matrix used to store off processor 
rows of B needed for local product */ 454 PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nz, Bbn1, NULL, &contents->workB1)); 455 } else contents->workB1 = NULL; 456 } 457 458 /* Create work matrix used to store off processor rows of B needed for local product */ 459 PetscCall(MatCreateSeqDense(PETSC_COMM_SELF, nz, Bbn, NULL, &contents->workB)); 460 461 /* Use MPI derived data type to reduce memory required by the send/recv buffers */ 462 PetscCall(PetscMalloc4(nsends, &stype, nrecvs, &rtype, nrecvs, &contents->rwaits, nsends, &contents->swaits)); 463 contents->stype = stype; 464 contents->nsends = nsends; 465 466 contents->rtype = rtype; 467 contents->nrecvs = nrecvs; 468 contents->blda = blda; 469 470 PetscCall(PetscMalloc1(Bm + 1, &disp)); 471 for (i = 0; i < nsends; i++) { 472 nrows_to = sstarts[i + 1] - sstarts[i]; 473 for (j = 0; j < nrows_to; j++) disp[j] = sindices[sstarts[i] + j]; /* rowB to be sent */ 474 PetscCallMPI(MPI_Type_create_indexed_block(nrows_to, 1, disp, MPIU_SCALAR, &type1)); 475 PetscCallMPI(MPI_Type_create_resized(type1, 0, blda * sizeof(PetscScalar), &stype[i])); 476 PetscCallMPI(MPI_Type_commit(&stype[i])); 477 PetscCallMPI(MPI_Type_free(&type1)); 478 } 479 480 for (i = 0; i < nrecvs; i++) { 481 /* received values from a process form a (nrows_from x Bbn) row block in workB (column-wise) */ 482 nrows_from = rstarts[i + 1] - rstarts[i]; 483 disp[0] = 0; 484 PetscCallMPI(MPI_Type_create_indexed_block(1, nrows_from, disp, MPIU_SCALAR, &type1)); 485 PetscCallMPI(MPI_Type_create_resized(type1, 0, nz * sizeof(PetscScalar), &rtype[i])); 486 PetscCallMPI(MPI_Type_commit(&rtype[i])); 487 PetscCallMPI(MPI_Type_free(&type1)); 488 } 489 490 PetscCall(PetscFree(disp)); 491 PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, NULL, NULL)); 492 PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, NULL, NULL)); 493 PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, 
PETSC_TRUE)); 494 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 495 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 496 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 497 498 C->product->data = contents; 499 C->product->destroy = MatMPIAIJ_MPIDenseDestroy; 500 C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIDense; 501 PetscFunctionReturn(0); 502 } 503 504 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat, Mat, Mat, const PetscBool); 505 506 /* 507 Performs an efficient scatter on the rows of B needed by this process; this is 508 a modification of the VecScatterBegin_() routines. 509 510 Input: If Bbidx = 0, uses B = Bb, else B = Bb1, see MatMatMultSymbolic_MPIAIJ_MPIDense() 511 */ 512 513 PetscErrorCode MatMPIDenseScatter(Mat A, Mat B, PetscInt Bbidx, Mat C, Mat *outworkB) { 514 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 515 const PetscScalar *b; 516 PetscScalar *rvalues; 517 VecScatter ctx = aij->Mvctx; 518 const PetscInt *sindices, *sstarts, *rstarts; 519 const PetscMPIInt *sprocs, *rprocs; 520 PetscInt i, nsends, nrecvs; 521 MPI_Request *swaits, *rwaits; 522 MPI_Comm comm; 523 PetscMPIInt tag = ((PetscObject)ctx)->tag, ncols = B->cmap->N, nrows = aij->B->cmap->n, nsends_mpi, nrecvs_mpi; 524 MPIAIJ_MPIDense *contents; 525 Mat workB; 526 MPI_Datatype *stype, *rtype; 527 PetscInt blda; 528 529 PetscFunctionBegin; 530 MatCheckProduct(C, 4); 531 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 532 contents = (MPIAIJ_MPIDense *)C->product->data; 533 PetscCall(VecScatterGetRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, &sprocs, NULL /*bs*/)); 534 PetscCall(VecScatterGetRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, &rprocs, NULL /*bs*/)); 535 PetscCall(PetscMPIIntCast(nsends, &nsends_mpi)); 536 PetscCall(PetscMPIIntCast(nrecvs, &nrecvs_mpi)); 537 if (Bbidx == 0) workB = *outworkB = contents->workB; 538 else workB = 
*outworkB = contents->workB1; 539 PetscCheck(nrows == workB->rmap->n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of rows of workB %" PetscInt_FMT " not equal to columns of aij->B %d", workB->cmap->n, nrows); 540 swaits = contents->swaits; 541 rwaits = contents->rwaits; 542 543 PetscCall(MatDenseGetArrayRead(B, &b)); 544 PetscCall(MatDenseGetLDA(B, &blda)); 545 PetscCheck(blda == contents->blda, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot reuse an input matrix with lda %" PetscInt_FMT " != %" PetscInt_FMT, blda, contents->blda); 546 PetscCall(MatDenseGetArray(workB, &rvalues)); 547 548 /* Post recv, use MPI derived data type to save memory */ 549 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 550 rtype = contents->rtype; 551 for (i = 0; i < nrecvs; i++) PetscCallMPI(MPI_Irecv(rvalues + (rstarts[i] - rstarts[0]), ncols, rtype[i], rprocs[i], tag, comm, rwaits + i)); 552 553 stype = contents->stype; 554 for (i = 0; i < nsends; i++) PetscCallMPI(MPI_Isend(b, ncols, stype[i], sprocs[i], tag, comm, swaits + i)); 555 556 if (nrecvs) PetscCallMPI(MPI_Waitall(nrecvs_mpi, rwaits, MPI_STATUSES_IGNORE)); 557 if (nsends) PetscCallMPI(MPI_Waitall(nsends_mpi, swaits, MPI_STATUSES_IGNORE)); 558 559 PetscCall(VecScatterRestoreRemote_Private(ctx, PETSC_TRUE /*send*/, &nsends, &sstarts, &sindices, &sprocs, NULL)); 560 PetscCall(VecScatterRestoreRemoteOrdered_Private(ctx, PETSC_FALSE /*recv*/, &nrecvs, &rstarts, NULL, &rprocs, NULL)); 561 PetscCall(MatDenseRestoreArrayRead(B, &b)); 562 PetscCall(MatDenseRestoreArray(workB, &rvalues)); 563 PetscFunctionReturn(0); 564 } 565 566 static PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIDense(Mat A, Mat B, Mat C) { 567 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 568 Mat_MPIDense *bdense = (Mat_MPIDense *)B->data; 569 Mat_MPIDense *cdense = (Mat_MPIDense *)C->data; 570 Mat workB; 571 MPIAIJ_MPIDense *contents; 572 573 PetscFunctionBegin; 574 MatCheckProduct(C, 3); 575 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, 
"Product data empty"); 576 contents = (MPIAIJ_MPIDense *)C->product->data; 577 /* diagonal block of A times all local rows of B */ 578 /* TODO: this calls a symbolic multiplication every time, which could be avoided */ 579 PetscCall(MatMatMult(aij->A, bdense->A, MAT_REUSE_MATRIX, PETSC_DEFAULT, &cdense->A)); 580 if (contents->workB->cmap->n == B->cmap->N) { 581 /* get off processor parts of B needed to complete C=A*B */ 582 PetscCall(MatMPIDenseScatter(A, B, 0, C, &workB)); 583 584 /* off-diagonal block of A times nonlocal rows of B */ 585 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B, workB, cdense->A, PETSC_TRUE)); 586 } else { 587 Mat Bb, Cb; 588 PetscInt BN = B->cmap->N, n = contents->workB->cmap->n, i; 589 PetscBool ccpu; 590 591 PetscCheck(n > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Column block size %" PetscInt_FMT " must be positive", n); 592 /* Prevent from unneeded copies back and forth from the GPU 593 when getting and restoring the submatrix 594 We need a proper GPU code for AIJ * dense in parallel */ 595 PetscCall(MatBoundToCPU(C, &ccpu)); 596 PetscCall(MatBindToCPU(C, PETSC_TRUE)); 597 for (i = 0; i < BN; i += n) { 598 PetscCall(MatDenseGetSubMatrix(B, PETSC_DECIDE, PETSC_DECIDE, i, PetscMin(i + n, BN), &Bb)); 599 PetscCall(MatDenseGetSubMatrix(C, PETSC_DECIDE, PETSC_DECIDE, i, PetscMin(i + n, BN), &Cb)); 600 601 /* get off processor parts of B needed to complete C=A*B */ 602 PetscCall(MatMPIDenseScatter(A, Bb, (i + n) > BN, C, &workB)); 603 604 /* off-diagonal block of A times nonlocal rows of B */ 605 cdense = (Mat_MPIDense *)Cb->data; 606 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(aij->B, workB, cdense->A, PETSC_TRUE)); 607 PetscCall(MatDenseRestoreSubMatrix(B, &Bb)); 608 PetscCall(MatDenseRestoreSubMatrix(C, &Cb)); 609 } 610 PetscCall(MatBindToCPU(C, ccpu)); 611 } 612 PetscFunctionReturn(0); 613 } 614 615 PetscErrorCode MatMatMultNumeric_MPIAIJ_MPIAIJ(Mat A, Mat P, Mat C) { 616 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data, *c = (Mat_MPIAIJ 
*)C->data; 617 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data; 618 Mat_SeqAIJ *cd = (Mat_SeqAIJ *)(c->A)->data, *co = (Mat_SeqAIJ *)(c->B)->data; 619 PetscInt *adi = ad->i, *adj, *aoi = ao->i, *aoj; 620 PetscScalar *ada, *aoa, *cda = cd->a, *coa = co->a; 621 Mat_SeqAIJ *p_loc, *p_oth; 622 PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *pj; 623 PetscScalar *pa_loc, *pa_oth, *pa, valtmp, *ca; 624 PetscInt cm = C->rmap->n, anz, pnz; 625 Mat_APMPI *ptap; 626 PetscScalar *apa_sparse; 627 const PetscScalar *dummy; 628 PetscInt *api, *apj, *apJ, i, j, k, row; 629 PetscInt cstart = C->cmap->rstart; 630 PetscInt cdnz, conz, k0, k1, nextp; 631 MPI_Comm comm; 632 PetscMPIInt size; 633 634 PetscFunctionBegin; 635 MatCheckProduct(C, 3); 636 ptap = (Mat_APMPI *)C->product->data; 637 PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data"); 638 PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 639 PetscCallMPI(MPI_Comm_size(comm, &size)); 640 PetscCheck(ptap->P_oth || size <= 1, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "AP cannot be reused. 
Do not call MatProductClear()"); 641 642 /* flag CPU mask for C */ 643 #if defined(PETSC_HAVE_DEVICE) 644 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 645 if (c->A->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->A->offloadmask = PETSC_OFFLOAD_CPU; 646 if (c->B->offloadmask != PETSC_OFFLOAD_UNALLOCATED) c->B->offloadmask = PETSC_OFFLOAD_CPU; 647 #endif 648 apa_sparse = ptap->apa; 649 650 /* 1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */ 651 /*-----------------------------------------------------*/ 652 /* update numerical values of P_oth and P_loc */ 653 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_REUSE_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 654 PetscCall(MatMPIAIJGetLocalMat(P, MAT_REUSE_MATRIX, &ptap->P_loc)); 655 656 /* 2) compute numeric C_loc = A_loc*P = Ad*P_loc + Ao*P_oth */ 657 /*----------------------------------------------------------*/ 658 /* get data from symbolic products */ 659 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 660 pi_loc = p_loc->i; 661 pj_loc = p_loc->j; 662 pa_loc = p_loc->a; 663 if (size > 1) { 664 p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 665 pi_oth = p_oth->i; 666 pj_oth = p_oth->j; 667 pa_oth = p_oth->a; 668 } else { 669 p_oth = NULL; 670 pi_oth = NULL; 671 pj_oth = NULL; 672 pa_oth = NULL; 673 } 674 675 /* trigger copy to CPU */ 676 PetscCall(MatSeqAIJGetArrayRead(a->A, &dummy)); 677 PetscCall(MatSeqAIJRestoreArrayRead(a->A, &dummy)); 678 PetscCall(MatSeqAIJGetArrayRead(a->B, &dummy)); 679 PetscCall(MatSeqAIJRestoreArrayRead(a->B, &dummy)); 680 api = ptap->api; 681 apj = ptap->apj; 682 for (i = 0; i < cm; i++) { 683 apJ = apj + api[i]; 684 685 /* diagonal portion of A */ 686 anz = adi[i + 1] - adi[i]; 687 adj = ad->j + adi[i]; 688 ada = ad->a + adi[i]; 689 for (j = 0; j < anz; j++) { 690 row = adj[j]; 691 pnz = pi_loc[row + 1] - pi_loc[row]; 692 pj = pj_loc + pi_loc[row]; 693 pa = pa_loc + pi_loc[row]; 694 /* perform sparse axpy */ 695 valtmp = ada[j]; 
696 nextp = 0; 697 for (k = 0; nextp < pnz; k++) { 698 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 699 apa_sparse[k] += valtmp * pa[nextp++]; 700 } 701 } 702 PetscCall(PetscLogFlops(2.0 * pnz)); 703 } 704 705 /* off-diagonal portion of A */ 706 anz = aoi[i + 1] - aoi[i]; 707 aoj = ao->j + aoi[i]; 708 aoa = ao->a + aoi[i]; 709 for (j = 0; j < anz; j++) { 710 row = aoj[j]; 711 pnz = pi_oth[row + 1] - pi_oth[row]; 712 pj = pj_oth + pi_oth[row]; 713 pa = pa_oth + pi_oth[row]; 714 /* perform sparse axpy */ 715 valtmp = aoa[j]; 716 nextp = 0; 717 for (k = 0; nextp < pnz; k++) { 718 if (apJ[k] == pj[nextp]) { /* column of AP == column of P */ 719 apa_sparse[k] += valtmp * pa[nextp++]; 720 } 721 } 722 PetscCall(PetscLogFlops(2.0 * pnz)); 723 } 724 725 /* set values in C */ 726 cdnz = cd->i[i + 1] - cd->i[i]; 727 conz = co->i[i + 1] - co->i[i]; 728 729 /* 1st off-diagonal part of C */ 730 ca = coa + co->i[i]; 731 k = 0; 732 for (k0 = 0; k0 < conz; k0++) { 733 if (apJ[k] >= cstart) break; 734 ca[k0] = apa_sparse[k]; 735 apa_sparse[k] = 0.0; 736 k++; 737 } 738 739 /* diagonal part of C */ 740 ca = cda + cd->i[i]; 741 for (k1 = 0; k1 < cdnz; k1++) { 742 ca[k1] = apa_sparse[k]; 743 apa_sparse[k] = 0.0; 744 k++; 745 } 746 747 /* 2nd off-diagonal part of C */ 748 ca = coa + co->i[i]; 749 for (; k0 < conz; k0++) { 750 ca[k0] = apa_sparse[k]; 751 apa_sparse[k] = 0.0; 752 k++; 753 } 754 } 755 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 756 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 757 PetscFunctionReturn(0); 758 } 759 760 /* same as MatMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(), except using LLCondensed to avoid O(BN) memory requirement */ 761 PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ(Mat A, Mat P, PetscReal fill, Mat C) { 762 MPI_Comm comm; 763 PetscMPIInt size; 764 Mat_APMPI *ptap; 765 PetscFreeSpaceList free_space = NULL, current_space = NULL; 766 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 767 Mat_SeqAIJ *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = 
(Mat_SeqAIJ *)(a->B)->data, *p_loc, *p_oth; 768 PetscInt *pi_loc, *pj_loc, *pi_oth, *pj_oth, *dnz, *onz; 769 PetscInt *adi = ad->i, *adj = ad->j, *aoi = ao->i, *aoj = ao->j, rstart = A->rmap->rstart; 770 PetscInt i, pnz, row, *api, *apj, *Jptr, apnz, nspacedouble = 0, j, nzi, *lnk, apnz_max = 1; 771 PetscInt am = A->rmap->n, pn = P->cmap->n, pm = P->rmap->n, lsize = pn + 20; 772 PetscReal afill; 773 MatType mtype; 774 775 PetscFunctionBegin; 776 MatCheckProduct(C, 4); 777 PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 778 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 779 PetscCallMPI(MPI_Comm_size(comm, &size)); 780 781 /* create struct Mat_APMPI and attached it to C later */ 782 PetscCall(PetscNew(&ptap)); 783 784 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 785 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 786 787 /* get P_loc by taking all local rows of P */ 788 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 789 790 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 791 pi_loc = p_loc->i; 792 pj_loc = p_loc->j; 793 if (size > 1) { 794 p_oth = (Mat_SeqAIJ *)(ptap->P_oth)->data; 795 pi_oth = p_oth->i; 796 pj_oth = p_oth->j; 797 } else { 798 p_oth = NULL; 799 pi_oth = NULL; 800 pj_oth = NULL; 801 } 802 803 /* first, compute symbolic AP = A_loc*P = A_diag*P_loc + A_off*P_oth */ 804 /*-------------------------------------------------------------------*/ 805 PetscCall(PetscMalloc1(am + 2, &api)); 806 ptap->api = api; 807 api[0] = 0; 808 809 PetscCall(PetscLLCondensedCreate_Scalable(lsize, &lnk)); 810 811 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 812 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space)); 813 current_space = free_space; 814 MatPreallocateBegin(comm, am, pn, 
dnz, onz); 815 for (i = 0; i < am; i++) { 816 /* diagonal portion of A */ 817 nzi = adi[i + 1] - adi[i]; 818 for (j = 0; j < nzi; j++) { 819 row = *adj++; 820 pnz = pi_loc[row + 1] - pi_loc[row]; 821 Jptr = pj_loc + pi_loc[row]; 822 /* Expand list if it is not long enough */ 823 if (pnz + apnz_max > lsize) { 824 lsize = pnz + apnz_max; 825 PetscCall(PetscLLCondensedExpand_Scalable(lsize, &lnk)); 826 } 827 /* add non-zero cols of P into the sorted linked list lnk */ 828 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 829 apnz = *lnk; /* The first element in the list is the number of items in the list */ 830 api[i + 1] = api[i] + apnz; 831 if (apnz > apnz_max) apnz_max = apnz + 1; /* '1' for diagonal entry */ 832 } 833 /* off-diagonal portion of A */ 834 nzi = aoi[i + 1] - aoi[i]; 835 for (j = 0; j < nzi; j++) { 836 row = *aoj++; 837 pnz = pi_oth[row + 1] - pi_oth[row]; 838 Jptr = pj_oth + pi_oth[row]; 839 /* Expand list if it is not long enough */ 840 if (pnz + apnz_max > lsize) { 841 lsize = pnz + apnz_max; 842 PetscCall(PetscLLCondensedExpand_Scalable(lsize, &lnk)); 843 } 844 /* add non-zero cols of P into the sorted linked list lnk */ 845 PetscCall(PetscLLCondensedAddSorted_Scalable(pnz, Jptr, lnk)); 846 apnz = *lnk; /* The first element in the list is the number of items in the list */ 847 api[i + 1] = api[i] + apnz; 848 if (apnz > apnz_max) apnz_max = apnz + 1; /* '1' for diagonal entry */ 849 } 850 851 /* add missing diagonal entry */ 852 if (C->force_diagonals) { 853 j = i + rstart; /* column index */ 854 PetscCall(PetscLLCondensedAddSorted_Scalable(1, &j, lnk)); 855 } 856 857 apnz = *lnk; 858 api[i + 1] = api[i] + apnz; 859 if (apnz > apnz_max) apnz_max = apnz; 860 861 /* if free space is not available, double the total space in the list */ 862 if (current_space->local_remaining < apnz) { 863 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(apnz, current_space->total_array_size), ¤t_space)); 864 nspacedouble++; 865 } 866 867 /* Copy data into 
free space, then initialize lnk */ 868 PetscCall(PetscLLCondensedClean_Scalable(apnz, current_space->array, lnk)); 869 PetscCall(MatPreallocateSet(i + rstart, apnz, current_space->array, dnz, onz)); 870 871 current_space->array += apnz; 872 current_space->local_used += apnz; 873 current_space->local_remaining -= apnz; 874 } 875 876 /* Allocate space for apj, initialize apj, and */ 877 /* destroy list of free space and other temporary array(s) */ 878 PetscCall(PetscMalloc1(api[am] + 1, &ptap->apj)); 879 apj = ptap->apj; 880 PetscCall(PetscFreeSpaceContiguous(&free_space, ptap->apj)); 881 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 882 883 /* create and assemble symbolic parallel matrix C */ 884 /*----------------------------------------------------*/ 885 PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 886 PetscCall(MatSetBlockSizesFromMats(C, A, P)); 887 PetscCall(MatGetType(A, &mtype)); 888 PetscCall(MatSetType(C, mtype)); 889 PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz)); 890 MatPreallocateEnd(dnz, onz); 891 892 /* malloc apa for assembly C */ 893 PetscCall(PetscCalloc1(apnz_max, &ptap->apa)); 894 895 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api)); 896 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 897 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 898 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 899 900 C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ; 901 C->ops->productnumeric = MatProductNumeric_AB; 902 903 /* attach the supporting struct to C for reuse */ 904 C->product->data = ptap; 905 C->product->destroy = MatDestroy_MPIAIJ_MatMatMult; 906 907 /* set MatInfo */ 908 afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5; 909 if (afill < 1.0) afill = 1.0; 910 C->info.mallocs = nspacedouble; 911 C->info.fill_ratio_given = fill; 912 C->info.fill_ratio_needed = afill; 913 914 #if defined(PETSC_USE_INFO) 915 if (api[am]) { 916 
    PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
    PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
  } else {
    PetscCall(PetscInfo(C, "Empty matrix product\n"));
  }
#endif
  PetscFunctionReturn(0);
}

/* This function is needed for the seqMPI matrix-matrix multiplication. */
/* Three input arrays are merged to one output array. The size of the */
/* output array is also output. Duplicate entries only show up once. */
/* Inputs in1/in2/in3 are assumed sorted ascending; out must be large */
/* enough to hold size1+size2+size3 entries (the worst, disjoint case). */
static void Merge3SortedArrays(PetscInt size1, PetscInt *in1, PetscInt size2, PetscInt *in2, PetscInt size3, PetscInt *in3, PetscInt *size4, PetscInt *out) {
  int i = 0, j = 0, k = 0, l = 0;

  /* Traverse all three arrays: the seven branches cover every strict-ordering
     and equality combination of the three heads, so exactly one fires per
     iteration and duplicates across arrays are emitted once */
  while (i < size1 && j < size2 && k < size3) {
    if (in1[i] < in2[j] && in1[i] < in3[k]) {
      out[l++] = in1[i++];
    } else if (in2[j] < in1[i] && in2[j] < in3[k]) {
      out[l++] = in2[j++];
    } else if (in3[k] < in1[i] && in3[k] < in2[j]) {
      out[l++] = in3[k++];
    } else if (in1[i] == in2[j] && in1[i] < in3[k]) {
      out[l++] = in1[i];
      i++, j++;
    } else if (in1[i] == in3[k] && in1[i] < in2[j]) {
      out[l++] = in1[i];
      i++, k++;
    } else if (in3[k] == in2[j] && in2[j] < in1[i]) {
      out[l++] = in2[j];
      k++, j++;
    } else if (in1[i] == in2[j] && in1[i] == in3[k]) {
      out[l++] = in1[i];
      i++, j++, k++;
    }
  }

  /* Traverse two remaining arrays */
  while (i < size1 && j < size2) {
    if (in1[i] < in2[j]) {
      out[l++] = in1[i++];
    } else if (in1[i] > in2[j]) {
      out[l++] = in2[j++];
    } else {
      out[l++] = in1[i];
      i++, j++;
    }
  }

  while (i < size1 && k < size3) {
    if (in1[i] < in3[k]) {
      out[l++] = in1[i++];
    } else if (in1[i] > in3[k]) {
      out[l++] = in3[k++];
    } else {
      out[l++] = in1[i];
      i++, k++;
    }
  }

  while (k < size3 && j < size2) {
    if (in3[k] < in2[j]) {
      out[l++] = in3[k++];
    } else if (in3[k] > in2[j]) {
      out[l++] = in2[j++];
    } else {
      out[l++] = in3[k];
      k++, j++;
    }
  }

  /* Traverse one remaining array */
  while (i < size1) out[l++] = in1[i++];
  while (j < size2) out[l++] = in2[j++];
  while (k < size3) out[l++] = in3[k++];

  *size4 = l;
}

/* This matrix-matrix multiplication algorithm divides the multiplication into three multiplications and */
/* adds up the products. Two of these three multiplications are performed with existing (sequential) */
/* matrix-matrix multiplications. */
PetscErrorCode MatMatMultSymbolic_MPIAIJ_MPIAIJ_seqMPI(Mat A, Mat P, PetscReal fill, Mat C) {
  MPI_Comm            comm;
  PetscMPIInt         size;
  Mat_APMPI          *ptap;
  PetscFreeSpaceList  free_space_diag = NULL, current_space = NULL;
  Mat_MPIAIJ         *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ         *ad = (Mat_SeqAIJ *)(a->A)->data, *ao = (Mat_SeqAIJ *)(a->B)->data, *p_loc;
  Mat_MPIAIJ         *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ         *adpd_seq, *p_off, *aopoth_seq;
  PetscInt            adponz, adpdnz;
  PetscInt           *pi_loc, *dnz, *onz;
  PetscInt           *adi = ad->i, *adj = ad->j, *aoi = ao->i, rstart = A->rmap->rstart;
  PetscInt           *lnk, i, i1 = 0, pnz, row, *adpoi, *adpoj, *api, *adpoJ, *aopJ, *apJ, *Jptr, aopnz, nspacedouble = 0, j, nzi, *apj, apnz, *adpdi, *adpdj, *adpdJ, *poff_i, *poff_j, *j_temp, *aopothi, *aopothj;
  PetscInt            am = A->rmap->n, pN = P->cmap->N, pn = P->cmap->n, pm = P->rmap->n, p_colstart, p_colend;
  PetscBT             lnkbt;
  PetscReal           afill;
  PetscMPIInt         rank;
  Mat                 adpd, aopoth;
  MatType             mtype;
  const char         *prefix;

  PetscFunctionBegin;
  MatCheckProduct(C, 4);
  PetscCheck(!C->product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty");
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm,
&rank)); 1026 PetscCall(MatGetOwnershipRangeColumn(P, &p_colstart, &p_colend)); 1027 1028 /* create struct Mat_APMPI and attached it to C later */ 1029 PetscCall(PetscNew(&ptap)); 1030 1031 /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */ 1032 PetscCall(MatGetBrowsOfAoCols_MPIAIJ(A, P, MAT_INITIAL_MATRIX, &ptap->startsj_s, &ptap->startsj_r, &ptap->bufa, &ptap->P_oth)); 1033 1034 /* get P_loc by taking all local rows of P */ 1035 PetscCall(MatMPIAIJGetLocalMat(P, MAT_INITIAL_MATRIX, &ptap->P_loc)); 1036 1037 p_loc = (Mat_SeqAIJ *)(ptap->P_loc)->data; 1038 pi_loc = p_loc->i; 1039 1040 /* Allocate memory for the i arrays of the matrices A*P, A_diag*P_off and A_offd * P */ 1041 PetscCall(PetscMalloc1(am + 2, &api)); 1042 PetscCall(PetscMalloc1(am + 2, &adpoi)); 1043 1044 adpoi[0] = 0; 1045 ptap->api = api; 1046 api[0] = 0; 1047 1048 /* create and initialize a linked list, will be used for both A_diag * P_loc_off and A_offd * P_oth */ 1049 PetscCall(PetscLLCondensedCreate(pN, pN, &lnk, &lnkbt)); 1050 MatPreallocateBegin(comm, am, pn, dnz, onz); 1051 1052 /* Symbolic calc of A_loc_diag * P_loc_diag */ 1053 PetscCall(MatGetOptionsPrefix(A, &prefix)); 1054 PetscCall(MatProductCreate(a->A, p->A, NULL, &adpd)); 1055 PetscCall(MatGetOptionsPrefix(A, &prefix)); 1056 PetscCall(MatSetOptionsPrefix(adpd, prefix)); 1057 PetscCall(MatAppendOptionsPrefix(adpd, "inner_diag_")); 1058 1059 PetscCall(MatProductSetType(adpd, MATPRODUCT_AB)); 1060 PetscCall(MatProductSetAlgorithm(adpd, "sorted")); 1061 PetscCall(MatProductSetFill(adpd, fill)); 1062 PetscCall(MatProductSetFromOptions(adpd)); 1063 1064 adpd->force_diagonals = C->force_diagonals; 1065 PetscCall(MatProductSymbolic(adpd)); 1066 1067 adpd_seq = (Mat_SeqAIJ *)((adpd)->data); 1068 adpdi = adpd_seq->i; 1069 adpdj = adpd_seq->j; 1070 p_off = (Mat_SeqAIJ *)((p->B)->data); 1071 poff_i = p_off->i; 1072 poff_j = p_off->j; 1073 1074 /* j_temp stores indices of a result row before they are added to 
the linked list */ 1075 PetscCall(PetscMalloc1(pN + 2, &j_temp)); 1076 1077 /* Symbolic calc of the A_diag * p_loc_off */ 1078 /* Initial FreeSpace size is fill*(nnz(A)+nnz(P)) */ 1079 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(adi[am], PetscIntSumTruncate(aoi[am], pi_loc[pm]))), &free_space_diag)); 1080 current_space = free_space_diag; 1081 1082 for (i = 0; i < am; i++) { 1083 /* A_diag * P_loc_off */ 1084 nzi = adi[i + 1] - adi[i]; 1085 for (j = 0; j < nzi; j++) { 1086 row = *adj++; 1087 pnz = poff_i[row + 1] - poff_i[row]; 1088 Jptr = poff_j + poff_i[row]; 1089 for (i1 = 0; i1 < pnz; i1++) j_temp[i1] = p->garray[Jptr[i1]]; 1090 /* add non-zero cols of P into the sorted linked list lnk */ 1091 PetscCall(PetscLLCondensedAddSorted(pnz, j_temp, lnk, lnkbt)); 1092 } 1093 1094 adponz = lnk[0]; 1095 adpoi[i + 1] = adpoi[i] + adponz; 1096 1097 /* if free space is not available, double the total space in the list */ 1098 if (current_space->local_remaining < adponz) { 1099 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(adponz, current_space->total_array_size), ¤t_space)); 1100 nspacedouble++; 1101 } 1102 1103 /* Copy data into free space, then initialize lnk */ 1104 PetscCall(PetscLLCondensedClean(pN, adponz, current_space->array, lnk, lnkbt)); 1105 1106 current_space->array += adponz; 1107 current_space->local_used += adponz; 1108 current_space->local_remaining -= adponz; 1109 } 1110 1111 /* Symbolic calc of A_off * P_oth */ 1112 PetscCall(MatSetOptionsPrefix(a->B, prefix)); 1113 PetscCall(MatAppendOptionsPrefix(a->B, "inner_offdiag_")); 1114 PetscCall(MatCreate(PETSC_COMM_SELF, &aopoth)); 1115 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(a->B, ptap->P_oth, fill, aopoth)); 1116 aopoth_seq = (Mat_SeqAIJ *)((aopoth)->data); 1117 aopothi = aopoth_seq->i; 1118 aopothj = aopoth_seq->j; 1119 1120 /* Allocate space for apj, adpj, aopj, ... 
*/ 1121 /* destroy lists of free space and other temporary array(s) */ 1122 1123 PetscCall(PetscMalloc1(aopothi[am] + adpoi[am] + adpdi[am] + 2, &ptap->apj)); 1124 PetscCall(PetscMalloc1(adpoi[am] + 2, &adpoj)); 1125 1126 /* Copy from linked list to j-array */ 1127 PetscCall(PetscFreeSpaceContiguous(&free_space_diag, adpoj)); 1128 PetscCall(PetscLLDestroy(lnk, lnkbt)); 1129 1130 adpoJ = adpoj; 1131 adpdJ = adpdj; 1132 aopJ = aopothj; 1133 apj = ptap->apj; 1134 apJ = apj; /* still empty */ 1135 1136 /* Merge j-arrays of A_off * P, A_diag * P_loc_off, and */ 1137 /* A_diag * P_loc_diag to get A*P */ 1138 for (i = 0; i < am; i++) { 1139 aopnz = aopothi[i + 1] - aopothi[i]; 1140 adponz = adpoi[i + 1] - adpoi[i]; 1141 adpdnz = adpdi[i + 1] - adpdi[i]; 1142 1143 /* Correct indices from A_diag*P_diag */ 1144 for (i1 = 0; i1 < adpdnz; i1++) adpdJ[i1] += p_colstart; 1145 /* Merge j-arrays of A_diag * P_loc_off and A_diag * P_loc_diag and A_off * P_oth */ 1146 Merge3SortedArrays(adponz, adpoJ, adpdnz, adpdJ, aopnz, aopJ, &apnz, apJ); 1147 PetscCall(MatPreallocateSet(i + rstart, apnz, apJ, dnz, onz)); 1148 1149 aopJ += aopnz; 1150 adpoJ += adponz; 1151 adpdJ += adpdnz; 1152 apJ += apnz; 1153 api[i + 1] = api[i] + apnz; 1154 } 1155 1156 /* malloc apa to store dense row A[i,:]*P */ 1157 PetscCall(PetscCalloc1(pN + 2, &ptap->apa)); 1158 1159 /* create and assemble symbolic parallel matrix C */ 1160 PetscCall(MatSetSizes(C, am, pn, PETSC_DETERMINE, PETSC_DETERMINE)); 1161 PetscCall(MatSetBlockSizesFromMats(C, A, P)); 1162 PetscCall(MatGetType(A, &mtype)); 1163 PetscCall(MatSetType(C, mtype)); 1164 PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz)); 1165 MatPreallocateEnd(dnz, onz); 1166 1167 PetscCall(MatSetValues_MPIAIJ_CopyFromCSRFormat_Symbolic(C, apj, api)); 1168 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1169 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1170 PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE)); 1171 1172 
  C->ops->matmultnumeric = MatMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;
  C->ops->productnumeric = MatProductNumeric_AB;

  /* attach the supporting struct to C for reuse */
  C->product->data    = ptap;
  C->product->destroy = MatDestroy_MPIAIJ_MatMatMult;

  /* set MatInfo */
  afill = (PetscReal)api[am] / (adi[am] + aoi[am] + pi_loc[pm] + 1) + 1.e-5;
  if (afill < 1.0) afill = 1.0;
  C->info.mallocs           = nspacedouble;
  C->info.fill_ratio_given  = fill;
  C->info.fill_ratio_needed = afill;

#if defined(PETSC_USE_INFO)
  if (api[am]) {
    PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
    PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill));
  } else {
    PetscCall(PetscInfo(C, "Empty matrix product\n"));
  }
#endif

  PetscCall(MatDestroy(&aopoth));
  PetscCall(MatDestroy(&adpd));
  PetscCall(PetscFree(j_temp));
  PetscCall(PetscFree(adpoj));
  PetscCall(PetscFree(adpoi));
  PetscFunctionReturn(0);
}

/*-------------------------------------------------------------------------*/
/* This routine only works when scall=MAT_REUSE_MATRIX! */
/* Recomputes Pt = P^T in place (reusing the precursor stored in ptap) and
   then delegates to the plain MatMatMult numeric kernel for Pt*A. */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult(Mat P, Mat A, Mat C) {
  Mat_APMPI *ptap;
  Mat        Pt;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->Pt, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");

  Pt = ptap->Pt;
  PetscCall(MatTransposeSetPrecursor(P, Pt));
  PetscCall(MatTranspose(P, MAT_REUSE_MATRIX, &Pt));
  PetscCall(MatMatMultNumeric_MPIAIJ_MPIAIJ(Pt, A, C));
  PetscFunctionReturn(0);
}

/* This routine is modified from MatPtAPSymbolic_MPIAIJ_MPIAIJ() */
PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(Mat P, Mat A, PetscReal fill, Mat C) {
  Mat_APMPI               *ptap;
  Mat_MPIAIJ              *p = (Mat_MPIAIJ *)P->data;
  MPI_Comm                 comm;
  PetscMPIInt              size, rank;
  PetscFreeSpaceList       free_space = NULL, current_space = NULL;
  PetscInt                 pn = P->cmap->n, aN = A->cmap->N, an = A->cmap->n;
  PetscInt                *lnk, i, k, nsend, rstart;
  PetscBT                  lnkbt;
  PetscMPIInt              tagi, tagj, *len_si, *len_s, *len_ri, nrecv;
  PETSC_UNUSED PetscMPIInt icompleted = 0;
  PetscInt               **buf_rj, **buf_ri, **buf_ri_k, row, ncols, *cols;
  PetscInt                 len, proc, *dnz, *onz, *owners, nzi;
  PetscInt                 nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request             *swaits, *rwaits;
  MPI_Status              *sstatus, rstatus;
  PetscLayout              rowmap;
  PetscInt                *owners_co, *coi, *coj; /* i and j array of (p->B)^T*A*P - used in the communication */
  PetscMPIInt             *len_r, *id_r; /* array of length of comm->size, store send/recv matrix values */
  PetscInt                *Jptr, *prmap = p->garray, con, j, Crmax;
  Mat_SeqAIJ              *a_loc, *c_loc, *c_oth;
  PetscTable               ta;
  MatType                  mtype;
  const char              *prefix;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  /* create symbolic parallel matrix C */
  PetscCall(MatGetType(A, &mtype));
  PetscCall(MatSetType(C, mtype));

  C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable;

  /* create struct Mat_APMPI and attached it to C later */
PetscCall(PetscNew(&ptap));

  /* (0) compute Rd = Pd^T, Ro = Po^T */
  /* --------------------------------- */
  PetscCall(MatTranspose(p->A, MAT_INITIAL_MATRIX, &ptap->Rd));
  PetscCall(MatTranspose(p->B, MAT_INITIAL_MATRIX, &ptap->Ro));

  /* (1) compute symbolic A_loc */
  /* ---------------------------*/
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &ptap->A_loc));

  /* (2-1) compute symbolic C_oth = Ro*A_loc */
  /* ------------------------------------ */
  PetscCall(MatGetOptionsPrefix(A, &prefix));
  PetscCall(MatSetOptionsPrefix(ptap->Ro, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Ro, "inner_offdiag_"));
  PetscCall(MatCreate(PETSC_COMM_SELF, &ptap->C_oth));
  PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Ro, ptap->A_loc, fill, ptap->C_oth));

  /* (3) send coj of C_oth to other processors */
  /* ------------------------------------------ */
  /* determine row ownership */
  PetscCall(PetscLayoutCreate(comm, &rowmap));
  rowmap->n  = pn;
  rowmap->bs = 1;
  PetscCall(PetscLayoutSetUp(rowmap));
  owners = rowmap->range;

  /* determine the number of messages to send, their lengths */
  PetscCall(PetscMalloc4(size, &len_s, size, &len_si, size, &sstatus, size + 2, &owners_co));
  PetscCall(PetscArrayzero(len_s, size));
  PetscCall(PetscArrayzero(len_si, size));

  c_oth = (Mat_SeqAIJ *)ptap->C_oth->data;
  coi   = c_oth->i;
  coj   = c_oth->j;
  con   = ptap->C_oth->rmap->n;
  proc  = 0;
  /* prmap[] is sorted by owner, so a single sweep suffices */
  for (i = 0; i < con; i++) {
    while (prmap[i] >= owners[proc + 1]) proc++;
    len_si[proc]++; /* num of rows in Co(=Pt*A) to be sent to [proc] */
    len_s[proc] += coi[i + 1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
  }

  len          = 0; /* max length of buf_si[], see (4) */
  owners_co[0] = 0;
  nsend        = 0;
  for (proc = 0; proc < size; proc++) {
    owners_co[proc + 1] = owners_co[proc] + len_si[proc];
    if (len_s[proc]) {
      nsend++;
      len_si[proc] = 2 * (len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
      len += len_si[proc];
    }
  }

  /* determine the number and length of messages to receive for coi and coj */
  PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &nrecv));
  PetscCall(PetscGatherMessageLengths2(comm, nsend, nrecv, len_s, len_si, &id_r, &len_r, &len_ri));

  /* post the Irecv and Isend of coj */
  PetscCall(PetscCommGetNewTag(comm, &tagj));
  PetscCall(PetscPostIrecvInt(comm, tagj, nrecv, id_r, len_r, &buf_rj, &rwaits));
  PetscCall(PetscMalloc1(nsend + 1, &swaits));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = owners_co[proc];
    PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k));
    k++;
  }

  /* (2-2) compute symbolic C_loc = Rd*A_loc */
  /* ---------------------------------------- */
  /* overlapped with the communication posted above */
  PetscCall(MatSetOptionsPrefix(ptap->Rd, prefix));
  PetscCall(MatAppendOptionsPrefix(ptap->Rd, "inner_diag_"));
  PetscCall(MatCreate(PETSC_COMM_SELF, &ptap->C_loc));
  PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(ptap->Rd, ptap->A_loc, fill, ptap->C_loc));
  c_loc = (Mat_SeqAIJ *)ptap->C_loc->data;

  /* receives coj are complete */
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  /* add received column indices into ta to update Crmax */
  a_loc = (Mat_SeqAIJ *)(ptap->A_loc)->data;

  /* create and initialize a linked list */
  PetscCall(PetscTableCreate(an, aN, &ta)); /* for compute Crmax */
  MatRowMergeMax_SeqAIJ(a_loc, ptap->A_loc->rmap->N, ta);

  for (k = 0; k < nrecv; k++) { /* k-th received message */
    Jptr = buf_rj[k];
    for (j = 0; j < len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES));
  }
  PetscCall(PetscTableGetCount(ta, &Crmax));
  PetscCall(PetscTableDestroy(&ta));

  /* (4) send and recv coi */
  /*-----------------------*/
  PetscCall(PetscCommGetNewTag(comm, &tagi));
  PetscCall(PetscPostIrecvInt(comm, tagi, nrecv, id_r, len_ri, &buf_ri, &rwaits));
  PetscCall(PetscMalloc1(len + 1, &buf_s));
  buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    /* form outgoing message for i-structure:
         buf_si[0]:                 nrows to be sent
               [1:nrows]:           row index (global)
               [nrows+1:2*nrows+1]: i-structure index
    */
    /*-------------------------------------------*/
    nrows       = len_si[proc] / 2 - 1; /* num of rows in Co to be sent to [proc] */
    buf_si_i    = buf_si + nrows + 1;
    buf_si[0]   = nrows;
    buf_si_i[0] = 0;
    nrows       = 0;
    for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) {
      nzi                 = coi[i + 1] - coi[i];
      buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */
      buf_si[nrows + 1]   = prmap[i] - owners[proc]; /* local row index */
      nrows++;
    }
    PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k));
    k++;
    buf_si += len_si[proc];
  }
  for (i = 0; i < nrecv; i++) PetscCallMPI(MPI_Waitany(nrecv, rwaits, &icompleted, &rstatus));
  PetscCall(PetscFree(rwaits));
  if (nsend) PetscCallMPI(MPI_Waitall(nsend, swaits, sstatus));

  PetscCall(PetscFree4(len_s, len_si, sstatus, owners_co));
  PetscCall(PetscFree(len_ri));
  PetscCall(PetscFree(swaits));
  PetscCall(PetscFree(buf_s));

  /* (5) compute the local portion of C */
  /* ------------------------------------------ */
  /* set initial free space to be Crmax, sufficient for holding nozeros in each row of C */
  PetscCall(PetscFreeSpaceGet(Crmax, &free_space));
  current_space = free_space;

  PetscCall(PetscMalloc3(nrecv, &buf_ri_k, nrecv, &nextrow, nrecv, &nextci));
  for (k = 0; k < nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *buf_ri_k[k];
    nextrow[k]  = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  MatPreallocateBegin(comm, pn, an, dnz, onz);
  PetscCall(PetscLLCondensedCreate(Crmax, aN, &lnk, &lnkbt));
  for (i = 0; i < pn; i++) { /* for each local row of C */
    /* add C_loc into C */
    nzi  = c_loc->i[i + 1] - c_loc->i[i];
    Jptr = c_loc->j + c_loc->i[i];
    PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));

    /* add received col data into lnk */
    for (k = 0; k < nrecv; k++) { /* k-th received message */
      if (i == *nextrow[k]) { /* i-th row */
        nzi  = *(nextci[k] + 1) - *nextci[k];
        Jptr = buf_rj[k] + *nextci[k];
        PetscCall(PetscLLCondensedAddSorted(nzi, Jptr, lnk, lnkbt));
        nextrow[k]++;
        nextci[k]++;
      }
    }

    /* add missing diagonal entry */
    if (C->force_diagonals) {
      k = i + owners[rank]; /* column index */
      PetscCall(PetscLLCondensedAddSorted(1, &k, lnk, lnkbt));
    }

    nzi = lnk[0];

    /* copy data into free space, then initialize lnk */
    PetscCall(PetscLLCondensedClean(aN, nzi, current_space->array, lnk, lnkbt));
    PetscCall(MatPreallocateSet(i + owners[rank], nzi, current_space->array, dnz, onz));
  }
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscCall(PetscLLDestroy(lnk, lnkbt));
  PetscCall(PetscFreeSpaceDestroy(free_space));

  /* local sizes and preallocation */
  PetscCall(MatSetSizes(C, pn, an, PETSC_DETERMINE, PETSC_DETERMINE));
  if (P->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, P->cmap->bs));
  if (A->cmap->bs > 0)
PetscCall(PetscLayoutSetBlockSize(C->cmap, A->cmap->bs));
  PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz));
  MatPreallocateEnd(dnz, onz);

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, NULL));
  for (i = 0; i < pn; i++) {
    ncols = c_loc->i[i + 1] - c_loc->i[i];
    cols  = c_loc->j + c_loc->i[i];
    row   = rstart + i;
    /* values are NULL: only the nonzero pattern is inserted here */
    PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, ncols, (const PetscInt *)cols, NULL, INSERT_VALUES));

    if (C->force_diagonals) PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, 1, (const PetscInt *)&row, NULL, INSERT_VALUES));
  }
  for (i = 0; i < con; i++) {
    ncols = c_oth->i[i + 1] - c_oth->i[i];
    cols  = c_oth->j + c_oth->i[i];
    row   = prmap[i];
    PetscCall(MatSetValues(C, 1, (const PetscInt *)&row, ncols, (const PetscInt *)cols, NULL, INSERT_VALUES));
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));

  /* members in merge */
  PetscCall(PetscFree(id_r));
  PetscCall(PetscFree(len_r));
  PetscCall(PetscFree(buf_ri[0]));
  PetscCall(PetscFree(buf_ri));
  PetscCall(PetscFree(buf_rj[0]));
  PetscCall(PetscFree(buf_rj));
  PetscCall(PetscLayoutDestroy(&rowmap));

  /* attach the supporting struct to C for reuse */
  C->product->data    = ptap;
  C->product->destroy = MatDestroy_MPIAIJ_PtAP;
  PetscFunctionReturn(0);
}

/* Numeric phase of C = P^T*A using the Rd/Ro/A_loc matrices cached in the
   product data by the nonscalable symbolic routine above. */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_nonscalable(Mat P, Mat A, Mat C) {
  Mat_MPIAIJ        *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ        *c_seq;
  Mat_APMPI         *ptap;
  Mat                A_loc, C_loc, C_oth;
  PetscInt           i, rstart, rend, cm, ncols, row;
  const PetscInt    *cols;
  const PetscScalar *vals;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ptap = (Mat_APMPI *)C->product->data;
  PetscCheck(ptap,
PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtAP cannot be computed. Missing data");
  PetscCheck(ptap->A_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");
  PetscCall(MatZeroEntries(C));

  /* These matrices are obtained in MatTransposeMatMultSymbolic() */
  /* 1) get R = Pd^T, Ro = Po^T */
  /*----------------------------*/
  PetscCall(MatTransposeSetPrecursor(p->A, ptap->Rd));
  PetscCall(MatTranspose(p->A, MAT_REUSE_MATRIX, &ptap->Rd));
  PetscCall(MatTransposeSetPrecursor(p->B, ptap->Ro));
  PetscCall(MatTranspose(p->B, MAT_REUSE_MATRIX, &ptap->Ro));

  /* 2) compute numeric A_loc */
  /*--------------------------*/
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &ptap->A_loc));

  /* 3) C_loc = Rd*A_loc, C_oth = Ro*A_loc */
  A_loc = ptap->A_loc;
  PetscCall(((ptap->C_loc)->ops->matmultnumeric)(ptap->Rd, A_loc, ptap->C_loc));
  PetscCall(((ptap->C_oth)->ops->matmultnumeric)(ptap->Ro, A_loc, ptap->C_oth));
  C_loc = ptap->C_loc;
  C_oth = ptap->C_oth;

  /* add C_loc and C_oth to C */
  PetscCall(MatGetOwnershipRange(C, &rstart, &rend));

  /* C_loc -> C */
  cm    = C_loc->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_loc->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = rstart + i;
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }

  /* Co -> C, off-processor part */
  cm    = C_oth->rmap->N;
  c_seq = (Mat_SeqAIJ *)C_oth->data;
  cols  = c_seq->j;
  vals  = c_seq->a;
  for (i = 0; i < cm; i++) {
    ncols = c_seq->i[i + 1] - c_seq->i[i];
    row   = p->garray[i]; /* global row index of this off-process row */
    PetscCall(MatSetValues(C, 1, &row, ncols, cols, vals, ADD_VALUES));
    cols += ncols;
    vals += ncols;
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* Numeric phase of C = P^T*A via the merge-based (outer-product) algorithm;
   relies on the Mat_Merge_SeqsToMPI data cached by the matching symbolic routine. */
PetscErrorCode MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ(Mat P, Mat A, Mat C) {
  Mat_Merge_SeqsToMPI *merge;
  Mat_MPIAIJ          *p = (Mat_MPIAIJ *)P->data;
  Mat_SeqAIJ          *pd = (Mat_SeqAIJ *)(p->A)->data, *po = (Mat_SeqAIJ *)(p->B)->data;
  Mat_APMPI           *ap;
  PetscInt            *adj;
  PetscInt             i, j, k, anz, pnz, row, *cj, nexta;
  MatScalar           *ada, *ca, valtmp;
  PetscInt             am = A->rmap->n, cm = C->rmap->n, pon = (p->B)->cmap->n;
  MPI_Comm             comm;
  PetscMPIInt          size, rank, taga, *len_s;
  PetscInt            *owners, proc, nrows, **buf_ri_k, **nextrow, **nextci;
  PetscInt           **buf_ri, **buf_rj;
  PetscInt             cnz = 0, *bj_i, *bi, *bj, bnz, nextcj; /* bi,bj,ba: local array of C(mpi mat) */
  MPI_Request         *s_waits, *r_waits;
  MPI_Status          *status;
  MatScalar          **abuf_r, *ba_i, *pA, *coa, *ba;
  const PetscScalar   *dummy;
  PetscInt            *ai, *aj, *coi, *coj, *poJ, *pdJ;
  Mat                  A_loc;
  Mat_SeqAIJ          *a_loc;

  PetscFunctionBegin;
  MatCheckProduct(C, 3);
  ap = (Mat_APMPI *)C->product->data;
  PetscCheck(ap, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be computed. Missing data");
  PetscCheck(ap->A_loc, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "PtA cannot be reused. Do not call MatProductClear()");
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

  merge = ap->merge;

  /* 2) compute numeric C_seq = P_loc^T*A_loc */
  /*------------------------------------------*/
  /* get data from symbolic products */
  coi = merge->coi;
  coj = merge->coj;
  PetscCall(PetscCalloc1(coi[pon] + 1, &coa));
  bi     = merge->bi;
  bj     = merge->bj;
  owners = merge->rowmap->range;
  PetscCall(PetscCalloc1(bi[cm] + 1, &ba));

  /* get A_loc by taking all local rows of A */
  A_loc = ap->A_loc;
  PetscCall(MatMPIAIJGetLocalMat(A, MAT_REUSE_MATRIX, &A_loc));
  a_loc = (Mat_SeqAIJ *)(A_loc)->data;
  ai    = a_loc->i;
  aj    = a_loc->j;

  /* trigger copy to CPU */
  PetscCall(MatSeqAIJGetArrayRead(p->A, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(p->A, &dummy));
  PetscCall(MatSeqAIJGetArrayRead(p->B, &dummy));
  PetscCall(MatSeqAIJRestoreArrayRead(p->B, &dummy));
  for (i = 0; i < am; i++) {
    anz = ai[i + 1] - ai[i];
    adj = aj + ai[i];
    ada = a_loc->a + ai[i];

    /* 2-b) Compute Cseq = P_loc[i,:]^T*A[i,:] using outer product */
    /*-------------------------------------------------------------*/
    /* put the value into Co=(p->B)^T*A (off-diagonal part, send to others) */
    pnz = po->i[i + 1] - po->i[i];
    poJ = po->j + po->i[i];
    pA  = po->a + po->i[i];
    for (j = 0; j < pnz; j++) {
      row = poJ[j];
      cj  = coj + coi[row];
      ca  = coa + coi[row];
      /* perform sparse axpy: cj and adj are sorted, so scan cj until each
         column of the A row is matched */
      nexta  = 0;
      valtmp = pA[j];
      for (k = 0; nexta < anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp * ada[nexta];
          nexta++;
        }
      }
      PetscCall(PetscLogFlops(2.0 * anz));
    }

    /* put the value into Cd (diagonal part) */
    pnz = pd->i[i + 1] - pd->i[i];
    pdJ = pd->j + pd->i[i];
    pA  = pd->a + pd->i[i];
    for (j = 0; j < pnz; j++) {
      row = pdJ[j];
      cj  = bj + bi[row];
      ca  = ba + bi[row];
      /* perform sparse axpy */
      nexta  = 0;
      valtmp = pA[j];
      for (k = 0; nexta < anz; k++) {
        if (cj[k] == adj[nexta]) {
          ca[k] += valtmp * ada[nexta];
          nexta++;
        }
      }
      PetscCall(PetscLogFlops(2.0 * anz));
    }
  }

  /* 3) send and recv matrix values coa */
  /*------------------------------------*/
  buf_ri = merge->buf_ri;
  buf_rj = merge->buf_rj;
  len_s  = merge->len_s;
  PetscCall(PetscCommGetNewTag(comm, &taga));
  PetscCall(PetscPostIrecvScalar(comm, taga, merge->nrecv, merge->id_r, merge->len_r, &abuf_r, &r_waits));

  PetscCall(PetscMalloc2(merge->nsend + 1, &s_waits, size, &status));
  for (proc = 0, k = 0; proc < size; proc++) {
    if (!len_s[proc]) continue;
    i = merge->owners_co[proc];
    PetscCallMPI(MPI_Isend(coa + coi[i], len_s[proc], MPIU_MATSCALAR, proc, taga, comm, s_waits + k));
    k++;
  }
  if (merge->nrecv) PetscCallMPI(MPI_Waitall(merge->nrecv, r_waits, status));
  if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, s_waits, status));

  PetscCall(PetscFree2(s_waits, status));
  PetscCall(PetscFree(r_waits));
  PetscCall(PetscFree(coa));

  /* 4) insert local Cseq and received values into Cmpi */
  /*----------------------------------------------------*/
  PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextci));
  for (k = 0; k < merge->nrecv; k++) {
    buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
    nrows       = *(buf_ri_k[k]);
    nextrow[k]  = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
    nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
  }

  for (i = 0; i < cm; i++) {
    row  = owners[rank] + i; /* global row index of C_seq */
    bj_i = bj + bi[i]; /* col indices of the i-th row of C */
    ba_i = ba + bi[i];
    bnz  = bi[i + 1] - bi[i];
    /* add received vals into ba */
    for (k = 0; k < merge->nrecv; k++) { /* k-th received message */
      /* i-th row */
      if (i == *nextrow[k]) {
        cnz    = *(nextci[k] + 1) - *nextci[k];
        cj     = buf_rj[k] + *(nextci[k]);
        ca     = abuf_r[k] + *(nextci[k]);
        nextcj = 0;
        for (j = 0; nextcj < cnz; j++) {
          if (bj_i[j] == cj[nextcj]) { /* bcol == ccol */
            ba_i[j] += ca[nextcj++];
          }
        }
        nextrow[k]++;
        nextci[k]++;
        PetscCall(PetscLogFlops(2.0 * cnz));
      }
    }
    PetscCall(MatSetValues(C, 1, &row, bnz, bj_i, ba_i, INSERT_VALUES));
  }
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));

  PetscCall(PetscFree(ba));
  PetscCall(PetscFree(abuf_r[0]));
  PetscCall(PetscFree(abuf_r));
  PetscCall(PetscFree3(buf_ri_k, nextrow, nextci));
  PetscFunctionReturn(0);
}

PetscErrorCode MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(Mat P, Mat A, PetscReal fill, Mat C) {
  Mat                  A_loc;
  Mat_APMPI           *ap;
  PetscFreeSpaceList   free_space = NULL, current_space = NULL;
  Mat_MPIAIJ          *p = (Mat_MPIAIJ *)P->data, *a = (Mat_MPIAIJ *)A->data;
  PetscInt            *pdti, *pdtj, *poti, *potj, *ptJ;
  PetscInt             nnz;
  PetscInt            *lnk, *owners_co, *coi, *coj, i, k, pnz, row;
  PetscInt             am = A->rmap->n, pn = P->cmap->n;
  MPI_Comm             comm;
  PetscMPIInt          size, rank, tagi, tagj, *len_si, *len_s, *len_ri;
  PetscInt           **buf_rj, **buf_ri, **buf_ri_k;
  PetscInt             len, proc, *dnz, *onz, *owners;
  PetscInt             nzi, *bi, *bj;
  PetscInt             nrows, *buf_s, *buf_si, *buf_si_i, **nextrow, **nextci;
  MPI_Request         *swaits, *rwaits;
  MPI_Status          *sstatus, rstatus;
  Mat_Merge_SeqsToMPI *merge;
  PetscInt            *ai, *aj, *Jptr, anz, *prmap = p->garray, pon, nspacedouble = 0, j;
  PetscReal            afill = 1.0, afill_tmp;
  PetscInt             rstart = P->cmap->rstart,
rmax, aN = A->cmap->N, Armax; 1748 Mat_SeqAIJ *a_loc; 1749 PetscTable ta; 1750 MatType mtype; 1751 1752 PetscFunctionBegin; 1753 PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 1754 /* check if matrix local sizes are compatible */ 1755 PetscCheck(A->rmap->rstart == P->rmap->rstart && A->rmap->rend == P->rmap->rend, comm, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != P (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->rstart, 1756 A->rmap->rend, P->rmap->rstart, P->rmap->rend); 1757 1758 PetscCallMPI(MPI_Comm_size(comm, &size)); 1759 PetscCallMPI(MPI_Comm_rank(comm, &rank)); 1760 1761 /* create struct Mat_APMPI and attached it to C later */ 1762 PetscCall(PetscNew(&ap)); 1763 1764 /* get A_loc by taking all local rows of A */ 1765 PetscCall(MatMPIAIJGetLocalMat(A, MAT_INITIAL_MATRIX, &A_loc)); 1766 1767 ap->A_loc = A_loc; 1768 a_loc = (Mat_SeqAIJ *)(A_loc)->data; 1769 ai = a_loc->i; 1770 aj = a_loc->j; 1771 1772 /* determine symbolic Co=(p->B)^T*A - send to others */ 1773 /*----------------------------------------------------*/ 1774 PetscCall(MatGetSymbolicTranspose_SeqAIJ(p->A, &pdti, &pdtj)); 1775 PetscCall(MatGetSymbolicTranspose_SeqAIJ(p->B, &poti, &potj)); 1776 pon = (p->B)->cmap->n; /* total num of rows to be sent to other processors 1777 >= (num of nonzero rows of C_seq) - pn */ 1778 PetscCall(PetscMalloc1(pon + 1, &coi)); 1779 coi[0] = 0; 1780 1781 /* set initial free space to be fill*(nnz(p->B) + nnz(A)) */ 1782 nnz = PetscRealIntMultTruncate(fill, PetscIntSumTruncate(poti[pon], ai[am])); 1783 PetscCall(PetscFreeSpaceGet(nnz, &free_space)); 1784 current_space = free_space; 1785 1786 /* create and initialize a linked list */ 1787 PetscCall(PetscTableCreate(A->cmap->n + a->B->cmap->N, aN, &ta)); 1788 MatRowMergeMax_SeqAIJ(a_loc, am, ta); 1789 PetscCall(PetscTableGetCount(ta, &Armax)); 1790 1791 PetscCall(PetscLLCondensedCreate_Scalable(Armax, &lnk)); 1792 1793 for (i = 0; i < pon; i++) { 1794 
pnz = poti[i + 1] - poti[i]; 1795 ptJ = potj + poti[i]; 1796 for (j = 0; j < pnz; j++) { 1797 row = ptJ[j]; /* row of A_loc == col of Pot */ 1798 anz = ai[row + 1] - ai[row]; 1799 Jptr = aj + ai[row]; 1800 /* add non-zero cols of AP into the sorted linked list lnk */ 1801 PetscCall(PetscLLCondensedAddSorted_Scalable(anz, Jptr, lnk)); 1802 } 1803 nnz = lnk[0]; 1804 1805 /* If free space is not available, double the total space in the list */ 1806 if (current_space->local_remaining < nnz) { 1807 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(nnz, current_space->total_array_size), ¤t_space)); 1808 nspacedouble++; 1809 } 1810 1811 /* Copy data into free space, and zero out denserows */ 1812 PetscCall(PetscLLCondensedClean_Scalable(nnz, current_space->array, lnk)); 1813 1814 current_space->array += nnz; 1815 current_space->local_used += nnz; 1816 current_space->local_remaining -= nnz; 1817 1818 coi[i + 1] = coi[i] + nnz; 1819 } 1820 1821 PetscCall(PetscMalloc1(coi[pon] + 1, &coj)); 1822 PetscCall(PetscFreeSpaceContiguous(&free_space, coj)); 1823 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); /* must destroy to get a new one for C */ 1824 1825 afill_tmp = (PetscReal)coi[pon] / (poti[pon] + ai[am] + 1); 1826 if (afill_tmp > afill) afill = afill_tmp; 1827 1828 /* send j-array (coj) of Co to other processors */ 1829 /*----------------------------------------------*/ 1830 /* determine row ownership */ 1831 PetscCall(PetscNew(&merge)); 1832 PetscCall(PetscLayoutCreate(comm, &merge->rowmap)); 1833 1834 merge->rowmap->n = pn; 1835 merge->rowmap->bs = 1; 1836 1837 PetscCall(PetscLayoutSetUp(merge->rowmap)); 1838 owners = merge->rowmap->range; 1839 1840 /* determine the number of messages to send, their lengths */ 1841 PetscCall(PetscCalloc1(size, &len_si)); 1842 PetscCall(PetscCalloc1(size, &merge->len_s)); 1843 1844 len_s = merge->len_s; 1845 merge->nsend = 0; 1846 1847 PetscCall(PetscMalloc1(size + 2, &owners_co)); 1848 1849 proc = 0; 1850 for (i = 0; i < pon; i++) { 1851 
while (prmap[i] >= owners[proc + 1]) proc++; 1852 len_si[proc]++; /* num of rows in Co to be sent to [proc] */ 1853 len_s[proc] += coi[i + 1] - coi[i]; 1854 } 1855 1856 len = 0; /* max length of buf_si[] */ 1857 owners_co[0] = 0; 1858 for (proc = 0; proc < size; proc++) { 1859 owners_co[proc + 1] = owners_co[proc] + len_si[proc]; 1860 if (len_si[proc]) { 1861 merge->nsend++; 1862 len_si[proc] = 2 * (len_si[proc] + 1); 1863 len += len_si[proc]; 1864 } 1865 } 1866 1867 /* determine the number and length of messages to receive for coi and coj */ 1868 PetscCall(PetscGatherNumberOfMessages(comm, NULL, len_s, &merge->nrecv)); 1869 PetscCall(PetscGatherMessageLengths2(comm, merge->nsend, merge->nrecv, len_s, len_si, &merge->id_r, &merge->len_r, &len_ri)); 1870 1871 /* post the Irecv and Isend of coj */ 1872 PetscCall(PetscCommGetNewTag(comm, &tagj)); 1873 PetscCall(PetscPostIrecvInt(comm, tagj, merge->nrecv, merge->id_r, merge->len_r, &buf_rj, &rwaits)); 1874 PetscCall(PetscMalloc1(merge->nsend + 1, &swaits)); 1875 for (proc = 0, k = 0; proc < size; proc++) { 1876 if (!len_s[proc]) continue; 1877 i = owners_co[proc]; 1878 PetscCallMPI(MPI_Isend(coj + coi[i], len_s[proc], MPIU_INT, proc, tagj, comm, swaits + k)); 1879 k++; 1880 } 1881 1882 /* receives and sends of coj are complete */ 1883 PetscCall(PetscMalloc1(size, &sstatus)); 1884 for (i = 0; i < merge->nrecv; i++) { 1885 PETSC_UNUSED PetscMPIInt icompleted; 1886 PetscCallMPI(MPI_Waitany(merge->nrecv, rwaits, &icompleted, &rstatus)); 1887 } 1888 PetscCall(PetscFree(rwaits)); 1889 if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, swaits, sstatus)); 1890 1891 /* add received column indices into table to update Armax */ 1892 /* Armax can be as large as aN if a P[row,:] is dense, see src/ksp/ksp/tutorials/ex56.c! 
*/ 1893 for (k = 0; k < merge->nrecv; k++) { /* k-th received message */ 1894 Jptr = buf_rj[k]; 1895 for (j = 0; j < merge->len_r[k]; j++) PetscCall(PetscTableAdd(ta, *(Jptr + j) + 1, 1, INSERT_VALUES)); 1896 } 1897 PetscCall(PetscTableGetCount(ta, &Armax)); 1898 1899 /* send and recv coi */ 1900 /*-------------------*/ 1901 PetscCall(PetscCommGetNewTag(comm, &tagi)); 1902 PetscCall(PetscPostIrecvInt(comm, tagi, merge->nrecv, merge->id_r, len_ri, &buf_ri, &rwaits)); 1903 PetscCall(PetscMalloc1(len + 1, &buf_s)); 1904 buf_si = buf_s; /* points to the beginning of k-th msg to be sent */ 1905 for (proc = 0, k = 0; proc < size; proc++) { 1906 if (!len_s[proc]) continue; 1907 /* form outgoing message for i-structure: 1908 buf_si[0]: nrows to be sent 1909 [1:nrows]: row index (global) 1910 [nrows+1:2*nrows+1]: i-structure index 1911 */ 1912 /*-------------------------------------------*/ 1913 nrows = len_si[proc] / 2 - 1; 1914 buf_si_i = buf_si + nrows + 1; 1915 buf_si[0] = nrows; 1916 buf_si_i[0] = 0; 1917 nrows = 0; 1918 for (i = owners_co[proc]; i < owners_co[proc + 1]; i++) { 1919 nzi = coi[i + 1] - coi[i]; 1920 buf_si_i[nrows + 1] = buf_si_i[nrows] + nzi; /* i-structure */ 1921 buf_si[nrows + 1] = prmap[i] - owners[proc]; /* local row index */ 1922 nrows++; 1923 } 1924 PetscCallMPI(MPI_Isend(buf_si, len_si[proc], MPIU_INT, proc, tagi, comm, swaits + k)); 1925 k++; 1926 buf_si += len_si[proc]; 1927 } 1928 i = merge->nrecv; 1929 while (i--) { 1930 PETSC_UNUSED PetscMPIInt icompleted; 1931 PetscCallMPI(MPI_Waitany(merge->nrecv, rwaits, &icompleted, &rstatus)); 1932 } 1933 PetscCall(PetscFree(rwaits)); 1934 if (merge->nsend) PetscCallMPI(MPI_Waitall(merge->nsend, swaits, sstatus)); 1935 PetscCall(PetscFree(len_si)); 1936 PetscCall(PetscFree(len_ri)); 1937 PetscCall(PetscFree(swaits)); 1938 PetscCall(PetscFree(sstatus)); 1939 PetscCall(PetscFree(buf_s)); 1940 1941 /* compute the local portion of C (mpi mat) */ 1942 /*------------------------------------------*/ 1943 /* 
allocate bi array and free space for accumulating nonzero column info */ 1944 PetscCall(PetscMalloc1(pn + 1, &bi)); 1945 bi[0] = 0; 1946 1947 /* set initial free space to be fill*(nnz(P) + nnz(AP)) */ 1948 nnz = PetscRealIntMultTruncate(fill, PetscIntSumTruncate(pdti[pn], PetscIntSumTruncate(poti[pon], ai[am]))); 1949 PetscCall(PetscFreeSpaceGet(nnz, &free_space)); 1950 current_space = free_space; 1951 1952 PetscCall(PetscMalloc3(merge->nrecv, &buf_ri_k, merge->nrecv, &nextrow, merge->nrecv, &nextci)); 1953 for (k = 0; k < merge->nrecv; k++) { 1954 buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */ 1955 nrows = *buf_ri_k[k]; 1956 nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */ 1957 nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */ 1958 } 1959 1960 PetscCall(PetscLLCondensedCreate_Scalable(Armax, &lnk)); 1961 MatPreallocateBegin(comm, pn, A->cmap->n, dnz, onz); 1962 rmax = 0; 1963 for (i = 0; i < pn; i++) { 1964 /* add pdt[i,:]*AP into lnk */ 1965 pnz = pdti[i + 1] - pdti[i]; 1966 ptJ = pdtj + pdti[i]; 1967 for (j = 0; j < pnz; j++) { 1968 row = ptJ[j]; /* row of AP == col of Pt */ 1969 anz = ai[row + 1] - ai[row]; 1970 Jptr = aj + ai[row]; 1971 /* add non-zero cols of AP into the sorted linked list lnk */ 1972 PetscCall(PetscLLCondensedAddSorted_Scalable(anz, Jptr, lnk)); 1973 } 1974 1975 /* add received col data into lnk */ 1976 for (k = 0; k < merge->nrecv; k++) { /* k-th received message */ 1977 if (i == *nextrow[k]) { /* i-th row */ 1978 nzi = *(nextci[k] + 1) - *nextci[k]; 1979 Jptr = buf_rj[k] + *nextci[k]; 1980 PetscCall(PetscLLCondensedAddSorted_Scalable(nzi, Jptr, lnk)); 1981 nextrow[k]++; 1982 nextci[k]++; 1983 } 1984 } 1985 1986 /* add missing diagonal entry */ 1987 if (C->force_diagonals) { 1988 k = i + owners[rank]; /* column index */ 1989 PetscCall(PetscLLCondensedAddSorted_Scalable(1, &k, lnk)); 1990 } 1991 1992 nnz = lnk[0]; 1993 1994 /* if free 
space is not available, make more free space */ 1995 if (current_space->local_remaining < nnz) { 1996 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(nnz, current_space->total_array_size), ¤t_space)); 1997 nspacedouble++; 1998 } 1999 /* copy data into free space, then initialize lnk */ 2000 PetscCall(PetscLLCondensedClean_Scalable(nnz, current_space->array, lnk)); 2001 PetscCall(MatPreallocateSet(i + owners[rank], nnz, current_space->array, dnz, onz)); 2002 2003 current_space->array += nnz; 2004 current_space->local_used += nnz; 2005 current_space->local_remaining -= nnz; 2006 2007 bi[i + 1] = bi[i] + nnz; 2008 if (nnz > rmax) rmax = nnz; 2009 } 2010 PetscCall(PetscFree3(buf_ri_k, nextrow, nextci)); 2011 2012 PetscCall(PetscMalloc1(bi[pn] + 1, &bj)); 2013 PetscCall(PetscFreeSpaceContiguous(&free_space, bj)); 2014 afill_tmp = (PetscReal)bi[pn] / (pdti[pn] + poti[pon] + ai[am] + 1); 2015 if (afill_tmp > afill) afill = afill_tmp; 2016 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 2017 PetscCall(PetscTableDestroy(&ta)); 2018 PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(p->A, &pdti, &pdtj)); 2019 PetscCall(MatRestoreSymbolicTranspose_SeqAIJ(p->B, &poti, &potj)); 2020 2021 /* create symbolic parallel matrix C - why cannot be assembled in Numeric part */ 2022 /*-------------------------------------------------------------------------------*/ 2023 PetscCall(MatSetSizes(C, pn, A->cmap->n, PETSC_DETERMINE, PETSC_DETERMINE)); 2024 PetscCall(MatSetBlockSizes(C, PetscAbs(P->cmap->bs), PetscAbs(A->cmap->bs))); 2025 PetscCall(MatGetType(A, &mtype)); 2026 PetscCall(MatSetType(C, mtype)); 2027 PetscCall(MatMPIAIJSetPreallocation(C, 0, dnz, 0, onz)); 2028 MatPreallocateEnd(dnz, onz); 2029 PetscCall(MatSetBlockSize(C, 1)); 2030 PetscCall(MatSetOption(C, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE)); 2031 for (i = 0; i < pn; i++) { 2032 row = i + rstart; 2033 nnz = bi[i + 1] - bi[i]; 2034 Jptr = bj + bi[i]; 2035 PetscCall(MatSetValues(C, 1, &row, nnz, Jptr, NULL, INSERT_VALUES)); 2036 } 
  PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY));
  PetscCall(MatSetOption(C, MAT_NEW_NONZERO_LOCATION_ERR, PETSC_TRUE));
  /* stash the structure arrays and communication plan for reuse by the numeric phase */
  merge->bi        = bi;
  merge->bj        = bj;
  merge->coi       = coi;
  merge->coj       = coj;
  merge->buf_ri    = buf_ri;
  merge->buf_rj    = buf_rj;
  merge->owners_co = owners_co;

  /* attach the supporting struct to C for reuse */
  C->product->data    = ap;
  C->product->destroy = MatDestroy_MPIAIJ_PtAP;
  ap->merge           = merge;

  C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ;

#if defined(PETSC_USE_INFO)
  if (bi[pn] != 0) {
    PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", nspacedouble, (double)fill, (double)afill));
    PetscCall(PetscInfo(C, "Use MatTransposeMatMult(A,B,MatReuse,%g,&C) for best performance.\n", (double)afill));
  } else {
    PetscCall(PetscInfo(C, "Empty matrix product\n"));
  }
#endif
  PetscFunctionReturn(0);
}

/* ---------------------------------------------------------------- */
/* Dispatch the symbolic phase of C = A^T * B by the algorithm selected on the product. */
static PetscErrorCode MatProductSymbolic_AtB_MPIAIJ_MPIAIJ(Mat C) {
  Mat_Product *product = C->product;
  Mat          A = product->A, B = product->B;
  PetscReal    fill = product->fill;
  PetscBool    flg;

  PetscFunctionBegin;
  /* scalable */
  PetscCall(PetscStrcmp(product->alg, "scalable", &flg));
  if (flg) {
    PetscCall(MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ(A, B, fill, C));
    goto next;
  }

  /* nonscalable */
  PetscCall(PetscStrcmp(product->alg, "nonscalable", &flg));
  if (flg) {
    PetscCall(MatTransposeMatMultSymbolic_MPIAIJ_MPIAIJ_nonscalable(A, B, fill, C));
    goto next;
  }

  /* matmatmult: form A^T explicitly, then run the AB symbolic product */
  PetscCall(PetscStrcmp(product->alg, "at*b", &flg));
  if (flg) {
    Mat        At;
    Mat_APMPI *ptap;

    PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At));
    PetscCall(MatMatMultSymbolic_MPIAIJ_MPIAIJ(At, B, fill, C));
    ptap = (Mat_APMPI *)C->product->data;
    if (ptap) {
      ptap->Pt            = At; /* ownership of At transfers to the product data; freed by the destroy callback */
      C->product->destroy = MatDestroy_MPIAIJ_PtAP;
    }
    C->ops->transposematmultnumeric = MatTransposeMatMultNumeric_MPIAIJ_MPIAIJ_matmatmult;
    goto next;
  }

  /* backend general code */
  PetscCall(PetscStrcmp(product->alg, "backend", &flg));
  if (flg) {
    PetscCall(MatProductSymbolic_MPIAIJBACKEND(C));
    PetscFunctionReturn(0); /* backend sets its own numeric routine; skip the 'next' hook */
  }

  SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatProduct type is not supported");

next:
  C->ops->productnumeric = MatProductNumeric_AtB;
  PetscFunctionReturn(0);
}

/* ---------------------------------------------------------------- */
/* Set options for MatMatMultxxx_MPIAIJ_MPIAIJ */
static PetscErrorCode MatProductSetFromOptions_MPIAIJ_AB(Mat C) {
  Mat_Product *product = C->product;
  Mat          A = product->A, B = product->B;
#if defined(PETSC_HAVE_HYPRE)
  const char *algTypes[5] = {"scalable", "nonscalable", "seqmpi", "backend", "hypre"};
  PetscInt    nalg        = 5;
#else
  const char *algTypes[4] = {
    "scalable",
    "nonscalable",
    "seqmpi",
    "backend",
  };
  PetscInt nalg = 4;
#endif
  PetscInt  alg = 1; /* set nonscalable algorithm as default */
  PetscBool flg;
  MPI_Comm  comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));

  /* Set "nonscalable" as default algorithm */
  PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
  if (flg) {
    PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

    /* Set "scalable" as default if BN and local nonzeros of A and B are large */
    if (B->cmap->N > 100000) { /* may switch to scalable algorithm as default */
      MatInfo   Ainfo, Binfo;
      PetscInt  nz_local;
      PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable;

      PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo));
      PetscCall(MatGetInfo(B, MAT_LOCAL, &Binfo));
      nz_local = (PetscInt)(Ainfo.nz_allocated + Binfo.nz_allocated);

      if (B->cmap->N > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE;
      /* any rank voting "scalable" switches the whole communicator (MPI_LOR reduce) */
      PetscCall(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPIU_BOOL, MPI_LOR, comm));

      if (alg_scalable) {
        alg = 0; /* scalable algorithm can be ~50% slower than the nonscalable algorithm, but uses less memory */
        PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
        PetscCall(PetscInfo(B, "Use scalable algorithm, BN %" PetscInt_FMT ", fill*nz_allocated %g\n", B->cmap->N, (double)(product->fill * nz_local)));
      }
    }
  }

  /* Get runtime option */
  if (product->api_user) {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat");
    PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat");
    PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  }
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

  C->ops->productsymbolic = MatProductSymbolic_AB_MPIAIJ_MPIAIJ;
  PetscFunctionReturn(0);
}

/* C = A*B^t is handled by transposing B at symbolic time, then reusing the AB machinery. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJ_ABt(Mat C) {
  PetscFunctionBegin;
  PetscCall(MatProductSetFromOptions_MPIAIJ_AB(C));
  C->ops->productsymbolic = MatProductSymbolic_ABt_MPIAIJ_MPIAIJ;
  PetscFunctionReturn(0);
}

/* Set options for MatTransposeMatMultXXX_MPIAIJ_MPIAIJ */
static PetscErrorCode MatProductSetFromOptions_MPIAIJ_AtB(Mat C) {
  Mat_Product *product = C->product;
  Mat          A = product->A, B = product->B;
  const char  *algTypes[4] = {"scalable", "nonscalable", "at*b", "backend"};
  PetscInt     nalg        = 4;
  PetscInt     alg         = 1; /* set default algorithm */
  PetscBool    flg;
  MPI_Comm     comm;

  PetscFunctionBegin;
  /* Check matrix local sizes: A^T*B requires A and B to share the row layout */
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(A->rmap->rstart == B->rmap->rstart && A->rmap->rend == B->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A (%" PetscInt_FMT ", %" PetscInt_FMT ") != B (%" PetscInt_FMT ",%" PetscInt_FMT ")",
             A->rmap->rstart, A->rmap->rend, B->rmap->rstart, B->rmap->rend);

  /* Set default algorithm */
  PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

  /* Set "scalable" as default if BN and local nonzeros of A and B are large */
  if (alg && B->cmap->N > 100000) { /* may switch to scalable algorithm as default */
    MatInfo   Ainfo, Binfo;
    PetscInt  nz_local;
    PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable;

    PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo));
    PetscCall(MatGetInfo(B, MAT_LOCAL, &Binfo));
    nz_local = (PetscInt)(Ainfo.nz_allocated + Binfo.nz_allocated);

    if (B->cmap->N > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE;
    PetscCall(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPIU_BOOL, MPI_LOR, comm));

    if (alg_scalable) {
      alg = 0; /* scalable algorithm can be ~50% slower than the nonscalable algorithm, but uses less memory */
      PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
      PetscCall(PetscInfo(B, "Use scalable algorithm, BN %" PetscInt_FMT ", fill*nz_allocated %g\n", B->cmap->N, (double)(product->fill * nz_local)));
    }
  }

  /* Get runtime option */
  if (product->api_user) {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix,
                      "MatTransposeMatMult", "Mat");
    PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat");
    PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  }
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

  C->ops->productsymbolic = MatProductSymbolic_AtB_MPIAIJ_MPIAIJ;
  PetscFunctionReturn(0);
}

/* Select the algorithm and symbolic routine for C = P^T*A*P on MPIAIJ matrices. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJ_PtAP(Mat C) {
  Mat_Product *product = C->product;
  Mat          A = product->A, P = product->B;
  MPI_Comm     comm;
  PetscBool    flg;
  PetscInt     alg = 1; /* set default algorithm */
#if !defined(PETSC_HAVE_HYPRE)
  const char *algTypes[5] = {"scalable", "nonscalable", "allatonce", "allatonce_merged", "backend"};
  PetscInt    nalg        = 5;
#else
  const char *algTypes[6] = {"scalable", "nonscalable", "allatonce", "allatonce_merged", "backend", "hypre"};
  PetscInt    nalg        = 6;
#endif
  PetscInt pN = P->cmap->N;

  PetscFunctionBegin;
  /* Check matrix local sizes: rows of A and P must match, and A's columns must match P's rows */
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(A->rmap->rstart == P->rmap->rstart && A->rmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Arow (%" PetscInt_FMT ", %" PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")",
             A->rmap->rstart, A->rmap->rend, P->rmap->rstart, P->rmap->rend);
  PetscCheck(A->cmap->rstart == P->rmap->rstart && A->cmap->rend == P->rmap->rend, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, Acol (%" PetscInt_FMT ", %" PetscInt_FMT ") != Prow (%" PetscInt_FMT ",%" PetscInt_FMT ")",
             A->cmap->rstart, A->cmap->rend, P->rmap->rstart, P->rmap->rend);

  /* Set "nonscalable" as default algorithm */
  PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
  if (flg) {
    PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

    /* Set "scalable" as default if BN and local nonzeros of A and B are large */
    if (pN > 100000) {
      MatInfo   Ainfo, Pinfo;
      PetscInt  nz_local;
      PetscBool alg_scalable_loc = PETSC_FALSE, alg_scalable;

      PetscCall(MatGetInfo(A, MAT_LOCAL, &Ainfo));
      PetscCall(MatGetInfo(P, MAT_LOCAL, &Pinfo));
      nz_local = (PetscInt)(Ainfo.nz_allocated + Pinfo.nz_allocated);

      if (pN > product->fill * nz_local) alg_scalable_loc = PETSC_TRUE;
      /* any rank voting "scalable" switches the whole communicator (MPI_LOR reduce) */
      PetscCall(MPIU_Allreduce(&alg_scalable_loc, &alg_scalable, 1, MPIU_BOOL, MPI_LOR, comm));

      if (alg_scalable) {
        alg = 0; /* scalable algorithm can be ~50% slower than the nonscalable algorithm, but uses less memory */
        PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));
      }
    }
  }

  /* Get runtime option */
  if (product->api_user) {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat");
    PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat");
    PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  }
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

  C->ops->productsymbolic = MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ;
  PetscFunctionReturn(0);
}

/* Select the symbolic routine for C = R*A*R^T on MPIAIJ matrices. */
static PetscErrorCode MatProductSetFromOptions_MPIAIJ_RARt(Mat C) {
  Mat_Product *product = C->product;
  Mat          A = product->A, R = product->B;

  PetscFunctionBegin;
  /* Check matrix local sizes: R*A*R^T requires A's local columns/rows to match R's local columns */
  PetscCheck(A->cmap->n == R->cmap->n && A->rmap->n == R->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Matrix local dimensions are incompatible, A local (%" PetscInt_FMT ", %" PetscInt_FMT "), R local (%" PetscInt_FMT ",%" PetscInt_FMT ")", A->rmap->n,
             A->rmap->n, R->rmap->n, R->cmap->n);

  C->ops->productsymbolic = MatProductSymbolic_RARt_MPIAIJ_MPIAIJ;
  PetscFunctionReturn(0);
}

/*
   Set options for ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm
*/
static PetscErrorCode MatProductSetFromOptions_MPIAIJ_ABC(Mat C) {
  Mat_Product *product     = C->product;
  PetscBool    flg         = PETSC_FALSE;
  PetscInt     alg         = 1; /* default algorithm */
  const char  *algTypes[3] = {"scalable", "nonscalable", "seqmpi"};
  PetscInt     nalg        = 3;

  PetscFunctionBegin;
  /* Set default algorithm */
  PetscCall(PetscStrcmp(C->product->alg, "default", &flg));
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

  /* Get runtime option */
  if (product->api_user) {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat");
    PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  } else {
    PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat");
    PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg));
    PetscOptionsEnd();
  }
  if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg]));

  C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_MPIAIJ_MPIAIJ_MPIAIJ;
  C->ops->productsymbolic    = MatProductSymbolic_ABC;
  PetscFunctionReturn(0);
}

/* Entry point: dispatch to the per-product-type option handler for MPIAIJ matrices.
   Unknown product types fall through silently; the caller reports unsupported products. */
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJ(Mat C) {
  Mat_Product *product = C->product;

  PetscFunctionBegin;
  switch (product->type) {
  case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_MPIAIJ_AB(C)); break;
  case MATPRODUCT_ABt: PetscCall(MatProductSetFromOptions_MPIAIJ_ABt(C)); break;
  case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_MPIAIJ_AtB(C)); break;
  case MATPRODUCT_PtAP: PetscCall(MatProductSetFromOptions_MPIAIJ_PtAP(C)); break;
  case MATPRODUCT_RARt: PetscCall(MatProductSetFromOptions_MPIAIJ_RARt(C)); break;
  case MATPRODUCT_ABC: PetscCall(MatProductSetFromOptions_MPIAIJ_ABC(C)); break;
  default: break;
  }
  PetscFunctionReturn(0);
}