1 2 /* 3 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4 C = A * B 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <petscbt.h> 10 #include <petsc/private/isimpl.h> 11 #include <../src/mat/impls/dense/seq/dense.h> 12 13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) { 14 PetscFunctionBegin; 15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 17 PetscFunctionReturn(0); 18 } 19 20 /* Modified from MatCreateSeqAIJWithArrays() */ 21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) { 22 PetscInt ii; 23 Mat_SeqAIJ *aij; 24 PetscBool isseqaij, osingle, ofree_a, ofree_ij; 25 26 PetscFunctionBegin; 27 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 28 PetscCall(MatSetSizes(mat, m, n, m, n)); 29 30 if (!mtype) { 31 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 32 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 33 } else { 34 PetscCall(MatSetType(mat, mtype)); 35 } 36 37 aij = (Mat_SeqAIJ *)(mat)->data; 38 osingle = aij->singlemalloc; 39 ofree_a = aij->free_a; 40 ofree_ij = aij->free_ij; 41 /* changes the free flags */ 42 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 43 44 PetscCall(PetscFree(aij->ilen)); 45 PetscCall(PetscFree(aij->imax)); 46 PetscCall(PetscMalloc1(m, &aij->imax)); 47 PetscCall(PetscMalloc1(m, &aij->ilen)); 48 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 49 const PetscInt rnz = i[ii + 1] - i[ii]; 50 aij->nonzerorowcnt += !!rnz; 51 aij->rmax = PetscMax(aij->rmax, rnz); 52 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 53 } 54 aij->maxnz = i[m]; 55 aij->nz = i[m]; 56 57 if (osingle) { 58 PetscCall(PetscFree3(aij->a, aij->j, aij->i)); 59 } else { 60 if (ofree_a) PetscCall(PetscFree(aij->a)); 61 if (ofree_ij) PetscCall(PetscFree(aij->j)); 62 if (ofree_ij) PetscCall(PetscFree(aij->i)); 63 } 64 aij->i = i; 65 aij->j = j; 66 aij->a = a; 67 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 68 /* default to not retain ownership */ 69 aij->singlemalloc = PETSC_FALSE; 70 aij->free_a = PETSC_FALSE; 71 aij->free_ij = PETSC_FALSE; 72 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 73 PetscFunctionReturn(0); 74 } 75 76 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) { 77 Mat_Product *product = C->product; 78 MatProductAlgorithm alg; 79 PetscBool flg; 80 81 PetscFunctionBegin; 82 if (product) { 83 alg = product->alg; 84 } else { 85 alg = "sorted"; 86 } 87 /* sorted */ 88 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 89 if (flg) { 90 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 91 PetscFunctionReturn(0); 92 } 93 94 /* scalable */ 95 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 96 if (flg) { 97 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 98 PetscFunctionReturn(0); 99 } 100 101 /* scalable_fast */ 102 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 103 if (flg) { 104 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 105 PetscFunctionReturn(0); 106 } 107 108 /* heap */ 109 PetscCall(PetscStrcmp(alg, "heap", &flg)); 110 if (flg) { 111 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 112 PetscFunctionReturn(0); 113 } 114 115 /* btheap */ 116 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 117 if (flg) { 118 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 119 PetscFunctionReturn(0); 120 } 121 122 /* llcondensed */ 123 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 124 if (flg) { 125 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 126 PetscFunctionReturn(0); 127 } 128 129 /* rowmerge */ 130 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 131 if (flg) { 132 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 133 PetscFunctionReturn(0); 134 } 135 136 #if defined(PETSC_HAVE_HYPRE) 137 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 138 if (flg) { 139 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 140 PetscFunctionReturn(0); 141 } 142 #endif 143 144 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 145 } 146 147 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) { 148 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 149 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 150 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 151 PetscReal afill; 152 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 153 PetscTable ta; 154 PetscBT lnkbt; 155 PetscFreeSpaceList free_space = NULL, current_space = NULL; 156 157 PetscFunctionBegin; 158 /* Get ci and cj */ 159 /*---------------*/ 160 /* Allocate ci array, arrays for fill computation and */ 161 /* free space for accumulating nonzero column info */ 162 PetscCall(PetscMalloc1(am + 2, &ci)); 163 ci[0] = 0; 164 165 /* create and initialize a linked list */ 166 PetscCall(PetscTableCreate(bn, bn, &ta)); 167 MatRowMergeMax_SeqAIJ(b, bm, ta); 168 PetscCall(PetscTableGetCount(ta, &Crmax)); 169 PetscCall(PetscTableDestroy(&ta)); 170 171 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 172 173 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 174 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 175 176 current_space = free_space; 177 178 /* Determine ci and cj */ 179 for (i = 0; i < am; i++) { 180 anzi = ai[i + 1] - ai[i]; 181 aj = a->j + ai[i]; 182 for (j = 0; j < anzi; j++) { 183 brow = aj[j]; 184 bnzj = bi[brow + 1] - bi[brow]; 185 bj = b->j + bi[brow]; 186 /* add non-zero cols of B into the sorted linked list lnk */ 187 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 188 } 189 /* add possible missing diagonal entry */ 190 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 191 cnzi = lnk[0]; 192 193 /* If free space is not available, make more free space */ 194 /* Double the amount of total space in the list */ 195 if (current_space->local_remaining < cnzi) { 196 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 197 ndouble++; 198 } 199 200 /* Copy data into free space, then initialize lnk */ 201 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 202 203 current_space->array += cnzi; 204 current_space->local_used += cnzi; 205 current_space->local_remaining -= cnzi; 206 207 ci[i + 1] = ci[i] + cnzi; 208 } 209 210 /* Column indices are in the list of free space */ 211 /* Allocate space for cj, initialize cj, and */ 212 /* destroy list of free space and other temporary array(s) */ 213 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 214 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 215 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 216 217 /* put together the new symbolic matrix */ 218 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 219 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 220 221 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 222 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 223 c = (Mat_SeqAIJ *)(C->data); 224 c->free_a = PETSC_FALSE; 225 c->free_ij = PETSC_TRUE; 226 c->nonew = 0; 227 228 /* fast, needs non-scalable O(bn) array 'abdense' */ 229 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 230 231 /* set MatInfo */ 232 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 233 if (afill < 1.0) afill = 1.0; 234 C->info.mallocs = ndouble; 235 C->info.fill_ratio_given = fill; 236 C->info.fill_ratio_needed = afill; 237 238 #if defined(PETSC_USE_INFO) 239 if (ci[am]) { 240 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 241 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 242 } else { 243 PetscCall(PetscInfo(C, "Empty matrix product\n")); 244 } 245 #endif 246 PetscFunctionReturn(0); 247 } 248 249 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) { 250 PetscLogDouble flops = 0.0; 251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 252 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 253 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 254 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 255 PetscInt am = A->rmap->n, cm = C->rmap->n; 256 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 257 PetscScalar *ca, valtmp; 258 PetscScalar *ab_dense; 259 PetscContainer cab_dense; 260 const PetscScalar *aa, *ba, *baj; 261 262 PetscFunctionBegin; 263 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 264 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 265 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 266 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 267 c->a = ca; 268 c->free_a = PETSC_TRUE; 269 } else ca = c->a; 270 271 /* TODO this should be done in the symbolic phase */ 272 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 273 that is hard to eradicate) */ 274 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 275 if (!cab_dense) { 276 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 277 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 278 PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 279 PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 280 PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 281 PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 282 } 283 PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 284 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 285 286 /* clean old values in C */ 287 PetscCall(PetscArrayzero(ca, ci[cm])); 288 /* Traverse A row-wise. */ 289 /* Build the ith row in C by summing over nonzero columns in A, */ 290 /* the rows of B corresponding to nonzeros of A. */ 291 for (i = 0; i < am; i++) { 292 anzi = ai[i + 1] - ai[i]; 293 for (j = 0; j < anzi; j++) { 294 brow = aj[j]; 295 bnzi = bi[brow + 1] - bi[brow]; 296 bjj = bj + bi[brow]; 297 baj = ba + bi[brow]; 298 /* perform dense axpy */ 299 valtmp = aa[j]; 300 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 301 flops += 2 * bnzi; 302 } 303 aj += anzi; 304 aa += anzi; 305 306 cnzi = ci[i + 1] - ci[i]; 307 for (k = 0; k < cnzi; k++) { 308 ca[k] += ab_dense[cj[k]]; 309 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 310 } 311 flops += cnzi; 312 cj += cnzi; 313 ca += cnzi; 314 } 315 #if defined(PETSC_HAVE_DEVICE) 316 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 317 #endif 318 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 319 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 320 PetscCall(PetscLogFlops(flops)); 321 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 322 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 323 PetscFunctionReturn(0); 324 } 325 326 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) { 327 PetscLogDouble flops = 0.0; 328 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 329 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 330 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 331 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 332 PetscInt am = A->rmap->N, cm = C->rmap->N; 333 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 334 PetscScalar *ca = c->a, valtmp; 335 const PetscScalar *aa, *ba, *baj; 336 PetscInt nextb; 337 338 PetscFunctionBegin; 339 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 340 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 341 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 342 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 343 c->a = ca; 344 c->free_a = PETSC_TRUE; 345 } 346 347 /* clean old values in C */ 348 PetscCall(PetscArrayzero(ca, ci[cm])); 349 /* Traverse A row-wise. */ 350 /* Build the ith row in C by summing over nonzero columns in A, */ 351 /* the rows of B corresponding to nonzeros of A. */ 352 for (i = 0; i < am; i++) { 353 anzi = ai[i + 1] - ai[i]; 354 cnzi = ci[i + 1] - ci[i]; 355 for (j = 0; j < anzi; j++) { 356 brow = aj[j]; 357 bnzi = bi[brow + 1] - bi[brow]; 358 bjj = bj + bi[brow]; 359 baj = ba + bi[brow]; 360 /* perform sparse axpy */ 361 valtmp = aa[j]; 362 nextb = 0; 363 for (k = 0; nextb < bnzi; k++) { 364 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 365 ca[k] += valtmp * baj[nextb++]; 366 } 367 } 368 flops += 2 * bnzi; 369 } 370 aj += anzi; 371 aa += anzi; 372 cj += cnzi; 373 ca += cnzi; 374 } 375 #if defined(PETSC_HAVE_DEVICE) 376 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 377 #endif 378 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 379 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 380 PetscCall(PetscLogFlops(flops)); 381 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 382 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 383 PetscFunctionReturn(0); 384 } 385 386 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) { 387 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 388 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 389 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 390 MatScalar *ca; 391 PetscReal afill; 392 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 393 PetscTable ta; 394 PetscFreeSpaceList free_space = NULL, current_space = NULL; 395 396 PetscFunctionBegin; 397 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 398 /*-----------------------------------------------------------------------------------------*/ 399 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 400 PetscCall(PetscMalloc1(am + 2, &ci)); 401 ci[0] = 0; 402 403 /* create and initialize a linked list */ 404 PetscCall(PetscTableCreate(bn, bn, &ta)); 405 MatRowMergeMax_SeqAIJ(b, bm, ta); 406 PetscCall(PetscTableGetCount(ta, &Crmax)); 407 PetscCall(PetscTableDestroy(&ta)); 408 409 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 410 411 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 412 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 413 current_space = free_space; 414 415 /* Determine ci and cj */ 416 for (i = 0; i < am; i++) { 417 anzi = ai[i + 1] - ai[i]; 418 aj = a->j + ai[i]; 419 for (j = 0; j < anzi; j++) { 420 brow = aj[j]; 421 bnzj = bi[brow + 1] - bi[brow]; 422 bj = b->j + bi[brow]; 423 /* add non-zero cols of B into the sorted linked list lnk */ 424 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 425 } 426 /* add possible missing diagonal entry */ 427 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 428 cnzi = lnk[1]; 429 430 /* If free space is not available, make more free space */ 431 /* Double the amount of total space in the list */ 432 if (current_space->local_remaining < cnzi) { 433 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 434 ndouble++; 435 } 436 437 /* Copy data into free space, then initialize lnk */ 438 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 439 440 current_space->array += cnzi; 441 current_space->local_used += cnzi; 442 current_space->local_remaining -= cnzi; 443 444 ci[i + 1] = ci[i] + cnzi; 445 } 446 447 /* Column indices are in the list of free space */ 448 /* Allocate space for cj, initialize cj, and */ 449 /* destroy list of free space and other temporary array(s) */ 450 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 451 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 452 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 453 454 /* Allocate space for ca */ 455 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 456 457 /* put together the new symbolic matrix */ 458 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 459 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 460 461 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 462 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 463 c = (Mat_SeqAIJ *)(C->data); 464 c->free_a = PETSC_TRUE; 465 c->free_ij = PETSC_TRUE; 466 c->nonew = 0; 467 468 /* slower, less memory */ 469 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 470 471 /* set MatInfo */ 472 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 473 if (afill < 1.0) afill = 1.0; 474 C->info.mallocs = ndouble; 475 C->info.fill_ratio_given = fill; 476 C->info.fill_ratio_needed = afill; 477 478 #if defined(PETSC_USE_INFO) 479 if (ci[am]) { 480 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 481 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 482 } else { 483 PetscCall(PetscInfo(C, "Empty matrix product\n")); 484 } 485 #endif 486 PetscFunctionReturn(0); 487 } 488 489 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) { 490 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 491 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 492 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 493 MatScalar *ca; 494 PetscReal afill; 495 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 496 PetscTable ta; 497 PetscFreeSpaceList free_space = NULL, current_space = NULL; 498 499 PetscFunctionBegin; 500 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 501 /*---------------------------------------------------------------------------------------------*/ 502 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 503 PetscCall(PetscMalloc1(am + 2, &ci)); 504 ci[0] = 0; 505 506 /* create and initialize a linked list */ 507 PetscCall(PetscTableCreate(bn, bn, &ta)); 508 MatRowMergeMax_SeqAIJ(b, bm, ta); 509 PetscCall(PetscTableGetCount(ta, &Crmax)); 510 PetscCall(PetscTableDestroy(&ta)); 511 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 512 513 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 514 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 515 current_space = free_space; 516 517 /* Determine ci and cj */ 518 for (i = 0; i < am; i++) { 519 anzi = ai[i + 1] - ai[i]; 520 aj = a->j + ai[i]; 521 for (j = 0; j < anzi; j++) { 522 brow = aj[j]; 523 bnzj = bi[brow + 1] - bi[brow]; 524 bj = b->j + bi[brow]; 525 /* add non-zero cols of B into the sorted linked list lnk */ 526 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 527 } 528 /* add possible missing diagonal entry */ 529 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 530 531 cnzi = lnk[0]; 532 533 /* If free space is not available, make more free space */ 534 /* Double the amount of total space in the list */ 535 if (current_space->local_remaining < cnzi) { 536 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 537 ndouble++; 538 } 539 540 /* Copy data into free space, then initialize lnk */ 541 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 542 543 current_space->array += cnzi; 544 current_space->local_used += cnzi; 545 current_space->local_remaining -= cnzi; 546 547 ci[i + 1] = ci[i] + cnzi; 548 } 549 550 /* Column indices are in the list of free space */ 551 /* Allocate space for cj, initialize cj, and */ 552 /* destroy list of free space and other temporary array(s) */ 553 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 554 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 555 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 556 557 /* Allocate space for ca */ 558 /*-----------------------*/ 559 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 560 561 /* put together the new symbolic matrix */ 562 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 563 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 564 565 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 566 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 567 c = (Mat_SeqAIJ *)(C->data); 568 c->free_a = PETSC_TRUE; 569 c->free_ij = PETSC_TRUE; 570 c->nonew = 0; 571 572 /* slower, less memory */ 573 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 574 575 /* set MatInfo */ 576 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 577 if (afill < 1.0) afill = 1.0; 578 C->info.mallocs = ndouble; 579 C->info.fill_ratio_given = fill; 580 C->info.fill_ratio_needed = afill; 581 582 #if defined(PETSC_USE_INFO) 583 if (ci[am]) { 584 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 585 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 586 } else { 587 PetscCall(PetscInfo(C, "Empty matrix product\n")); 588 } 589 #endif 590 PetscFunctionReturn(0); 591 } 592 593 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) { 594 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 595 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 596 PetscInt *ci, *cj, *bb; 597 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 598 PetscReal afill; 599 PetscInt i, j, col, ndouble = 0; 600 PetscFreeSpaceList free_space = NULL, current_space = NULL; 601 PetscHeap h; 602 603 PetscFunctionBegin; 604 /* Get ci and cj - by merging sorted rows using a heap */ 605 /*---------------------------------------------------------------------------------------------*/ 606 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 607 PetscCall(PetscMalloc1(am + 2, &ci)); 608 ci[0] = 0; 609 610 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 611 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 612 current_space = free_space; 613 614 PetscCall(PetscHeapCreate(a->rmax, &h)); 615 PetscCall(PetscMalloc1(a->rmax, &bb)); 616 617 /* Determine ci and cj */ 618 for (i = 0; i < am; i++) { 619 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 620 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 621 ci[i + 1] = ci[i]; 622 /* Populate the min heap */ 623 for (j = 0; j < anzi; j++) { 624 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 625 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 626 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 627 } 628 } 629 /* Pick off the min element, adding it to free space */ 630 PetscCall(PetscHeapPop(h, &j, &col)); 631 while (j >= 0) { 632 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 633 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 634 ndouble++; 635 } 636 *(current_space->array++) = col; 637 current_space->local_used++; 638 current_space->local_remaining--; 639 ci[i + 1]++; 640 641 /* stash if anything else remains in this row of B */ 642 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 643 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 644 PetscInt j2, col2; 645 PetscCall(PetscHeapPeek(h, &j2, &col2)); 646 if (col2 != col) break; 647 PetscCall(PetscHeapPop(h, &j2, &col2)); 648 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 649 } 650 /* Put any stashed elements back into the min heap */ 651 PetscCall(PetscHeapUnstash(h)); 652 PetscCall(PetscHeapPop(h, &j, &col)); 653 } 654 } 655 PetscCall(PetscFree(bb)); 656 PetscCall(PetscHeapDestroy(&h)); 657 658 /* Column indices are in the list of free space */ 659 /* Allocate space for cj, initialize cj, and */ 660 /* destroy list of free space and other temporary array(s) */ 661 PetscCall(PetscMalloc1(ci[am], &cj)); 662 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 663 664 /* put together the new symbolic matrix */ 665 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 666 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 667 668 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 669 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 670 c = (Mat_SeqAIJ *)(C->data); 671 c->free_a = PETSC_TRUE; 672 c->free_ij = PETSC_TRUE; 673 c->nonew = 0; 674 675 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 676 677 /* set MatInfo */ 678 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 679 if (afill < 1.0) afill = 1.0; 680 C->info.mallocs = ndouble; 681 C->info.fill_ratio_given = fill; 682 C->info.fill_ratio_needed = afill; 683 684 #if defined(PETSC_USE_INFO) 685 if (ci[am]) { 686 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 687 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 688 } else { 689 PetscCall(PetscInfo(C, "Empty matrix product\n")); 690 } 691 #endif 692 PetscFunctionReturn(0); 693 } 694 695 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) { 696 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 697 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 698 PetscInt *ci, *cj, *bb; 699 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 700 PetscReal afill; 701 PetscInt i, j, col, ndouble = 0; 702 PetscFreeSpaceList free_space = NULL, current_space = NULL; 703 PetscHeap h; 704 PetscBT bt; 705 706 PetscFunctionBegin; 707 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 708 /*---------------------------------------------------------------------------------------------*/ 709 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 710 PetscCall(PetscMalloc1(am + 2, &ci)); 711 ci[0] = 0; 712 713 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 714 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 715 716 current_space = free_space; 717 718 PetscCall(PetscHeapCreate(a->rmax, &h)); 719 PetscCall(PetscMalloc1(a->rmax, &bb)); 720 PetscCall(PetscBTCreate(bn, &bt)); 721 722 /* Determine ci and cj */ 723 for (i = 0; i < am; i++) { 724 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 725 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 726 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 727 ci[i + 1] = ci[i]; 728 /* Populate the min heap */ 729 for (j = 0; j < anzi; j++) { 730 PetscInt brow = acol[j]; 731 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 732 PetscInt bcol = bj[bb[j]]; 733 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 734 PetscCall(PetscHeapAdd(h, j, bcol)); 735 bb[j]++; 736 break; 737 } 738 } 739 } 740 /* Pick off the min element, adding it to free space */ 741 PetscCall(PetscHeapPop(h, &j, &col)); 742 while (j >= 0) { 743 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 744 fptr = NULL; /* need PetscBTMemzero */ 745 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 746 ndouble++; 747 } 748 *(current_space->array++) = col; 749 current_space->local_used++; 750 current_space->local_remaining--; 751 ci[i + 1]++; 752 753 /* stash if anything else remains in this row of B */ 754 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 755 PetscInt bcol = bj[bb[j]]; 756 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 757 PetscCall(PetscHeapAdd(h, j, bcol)); 758 bb[j]++; 759 break; 760 } 761 } 762 PetscCall(PetscHeapPop(h, &j, &col)); 763 } 764 if (fptr) { /* Clear the bits for this row */ 765 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 766 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 767 PetscCall(PetscBTMemzero(bn, bt)); 768 } 769 } 770 PetscCall(PetscFree(bb)); 771 PetscCall(PetscHeapDestroy(&h)); 772 PetscCall(PetscBTDestroy(&bt)); 773 774 /* Column indices are in the list of free space */ 775 /* Allocate space for cj, initialize cj, and */ 776 /* destroy list of free space and other temporary array(s) */ 777 PetscCall(PetscMalloc1(ci[am], &cj)); 778 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 779 780 /* put together the new symbolic matrix */ 781 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 782 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 783 784 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 785 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 786 c = (Mat_SeqAIJ *)(C->data); 787 c->free_a = PETSC_TRUE; 788 c->free_ij = PETSC_TRUE; 789 c->nonew = 0; 790 791 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 792 793 /* set MatInfo */ 794 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 795 if (afill < 1.0) afill = 1.0; 796 C->info.mallocs = ndouble; 797 C->info.fill_ratio_given = fill; 798 C->info.fill_ratio_needed = afill; 799 800 #if defined(PETSC_USE_INFO) 801 if (ci[am]) { 802 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 803 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 804 } else { 805 PetscCall(PetscInfo(C, "Empty matrix product\n")); 806 } 807 #endif 808 PetscFunctionReturn(0); 809 } 810 811 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) { 812 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 813 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 814 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 815 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 816 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 817 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 818 const PetscInt *brow_ptr[8], *brow_end[8]; 819 PetscInt window[8]; 820 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 821 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 822 PetscReal afill; 823 PetscInt *workj_L1, *workj_L2, *workj_L3; 824 PetscInt L1_nnz, L2_nnz; 825 826 /* Step 1: Get upper bound on memory required for allocation. 827 Because of the way virtual memory works, 828 only the memory pages that are actually needed will be physically allocated. */ 829 PetscFunctionBegin; 830 PetscCall(PetscMalloc1(am + 1, &ci)); 831 for (i = 0; i < am; i++) { 832 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 833 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 834 a_rownnz = 0; 835 for (k = 0; k < anzi; ++k) { 836 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 837 if (a_rownnz > bn) { 838 a_rownnz = bn; 839 break; 840 } 841 } 842 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 843 } 844 /* temporary work areas for merging rows */ 845 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 846 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 847 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 848 849 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 850 c_maxmem = 8 * (ai[am] + bi[bm]); 851 /* Step 2: Populate pattern for C */ 852 PetscCall(PetscMalloc1(c_maxmem, &cj)); 853 854 ci_nnz = 0; 855 ci[0] = 0; 856 worki_L1[0] = 0; 857 worki_L2[0] = 0; 858 for (i = 0; i < am; i++) { 859 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 860 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 861 rowsleft = anzi; 862 inputcol_L1 = acol; 863 L2_nnz = 0; 864 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 865 worki_L2[1] = 0; 866 outputi_nnz = 0; 867 868 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 869 while (ci_nnz + a_maxrownnz > c_maxmem) { 870 c_maxmem *= 2; 871 ndouble++; 872 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 873 } 874 875 while (rowsleft) { 876 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 877 L1_nrows = 0; 878 L1_nnz = 0; 879 inputcol = inputcol_L1; 880 inputi = bi; 881 inputj = bj; 882 883 /* The following macro is used to specialize for small rows in A. 884 This helps with compiler unrolling, improving performance substantially. 885 Input: inputj inputi inputcol bn 886 Output: outputj outputi_nnz */ 887 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 888 window_min = bn; \ 889 outputi_nnz = 0; \ 890 for (k = 0; k < ANNZ; ++k) { \ 891 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 892 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 893 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 894 window_min = PetscMin(window[k], window_min); \ 895 } \ 896 while (window_min < bn) { \ 897 outputj[outputi_nnz++] = window_min; \ 898 /* advance front and compute new minimum */ \ 899 old_window_min = window_min; \ 900 window_min = bn; \ 901 for (k = 0; k < ANNZ; ++k) { \ 902 if (window[k] == old_window_min) { \ 903 brow_ptr[k]++; \ 904 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 905 } \ 906 window_min = PetscMin(window[k], window_min); \ 907 } \ 908 } 909 910 /************** L E V E L 1 ***************/ 911 /* Merge up to 8 rows of B to L1 work array*/ 912 while (L1_rowsleft) { 913 outputi_nnz = 0; 914 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 915 else outputj = cj + ci_nnz; /* Merge directly to C */ 916 917 switch (L1_rowsleft) { 918 case 1: 919 brow_ptr[0] = inputj + inputi[inputcol[0]]; 920 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 921 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 922 inputcol += L1_rowsleft; 923 rowsleft -= L1_rowsleft; 924 L1_rowsleft = 0; 925 break; 926 case 2: 927 MatMatMultSymbolic_RowMergeMacro(2); 928 inputcol += L1_rowsleft; 929 rowsleft -= L1_rowsleft; 930 L1_rowsleft = 0; 931 break; 932 case 3: 933 MatMatMultSymbolic_RowMergeMacro(3); 934 inputcol += L1_rowsleft; 935 rowsleft -= L1_rowsleft; 936 L1_rowsleft = 0; 937 break; 938 case 4: 939 MatMatMultSymbolic_RowMergeMacro(4); 940 inputcol += L1_rowsleft; 941 rowsleft -= L1_rowsleft; 942 L1_rowsleft = 0; 943 break; 944 case 5: 945 MatMatMultSymbolic_RowMergeMacro(5); 946 inputcol += L1_rowsleft; 947 rowsleft -= L1_rowsleft; 948 L1_rowsleft = 0; 949 break; 950 case 6: 951 MatMatMultSymbolic_RowMergeMacro(6); 952 inputcol += L1_rowsleft; 953 rowsleft -= L1_rowsleft; 954 L1_rowsleft = 0; 955 break; 956 case 7: 957 MatMatMultSymbolic_RowMergeMacro(7); 958 inputcol += L1_rowsleft; 959 rowsleft -= L1_rowsleft; 960 L1_rowsleft = 0; 961 break; 962 default: 963 MatMatMultSymbolic_RowMergeMacro(8); 964 inputcol += 8; 965 rowsleft -= 8; 966 L1_rowsleft -= 8; 967 break; 968 } 969 inputcol_L1 = inputcol; 970 L1_nnz += outputi_nnz; 971 worki_L1[++L1_nrows] = L1_nnz; 972 } 973 974 /********************** L E V E L 2 ************************/ 975 /* Merge from L1 work array to either C or to L2 work array */ 976 if (anzi > 8) { 977 inputi = worki_L1; 978 inputj = workj_L1; 979 inputcol = workcol; 980 outputi_nnz = 0; 981 982 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 983 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 984 985 switch (L1_nrows) { 986 case 1: 987 brow_ptr[0] = inputj + inputi[inputcol[0]]; 988 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 989 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 990 break; 991 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 992 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 993 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 994 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 995 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 996 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 997 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 998 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 999 } 1000 L2_nnz += outputi_nnz; 1001 worki_L2[++L2_nrows] = L2_nnz; 1002 1003 /************************ L E V E L 3 **********************/ 1004 /* Merge from L2 work array to either C or to L2 work array */ 1005 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1006 inputi = worki_L2; 1007 inputj = workj_L2; 1008 inputcol = workcol; 1009 outputi_nnz = 0; 1010 if (rowsleft) outputj = workj_L3; 1011 else outputj = cj + ci_nnz; 1012 switch (L2_nrows) { 1013 case 1: 1014 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1015 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1016 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1017 break; 1018 case 2: MatMatMultSymbolic_RowMergeMacro(2); break; 1019 case 3: MatMatMultSymbolic_RowMergeMacro(3); break; 1020 case 4: MatMatMultSymbolic_RowMergeMacro(4); break; 1021 case 5: MatMatMultSymbolic_RowMergeMacro(5); break; 1022 case 6: MatMatMultSymbolic_RowMergeMacro(6); break; 1023 case 7: MatMatMultSymbolic_RowMergeMacro(7); break; 1024 case 8: MatMatMultSymbolic_RowMergeMacro(8); break; 1025 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1026 } 1027 L2_nrows = 1; 1028 L2_nnz = outputi_nnz; 1029 worki_L2[1] = outputi_nnz; 1030 /* Copy to workj_L2 */ 1031 if (rowsleft) { 1032 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1033 } 1034 } 1035 } 1036 } /* while (rowsleft) */ 1037 #undef MatMatMultSymbolic_RowMergeMacro 1038 1039 /* terminate current row */ 1040 ci_nnz += outputi_nnz; 1041 ci[i + 1] = ci_nnz; 1042 } 1043 1044 /* Step 3: Create the new symbolic matrix */ 1045 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1046 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1047 1048 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1049 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1050 c = (Mat_SeqAIJ *)(C->data); 1051 c->free_a = PETSC_TRUE; 1052 c->free_ij = PETSC_TRUE; 1053 c->nonew = 0; 1054 1055 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1056 1057 /* set MatInfo */ 1058 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1059 if (afill < 1.0) afill = 1.0; 1060 C->info.mallocs = ndouble; 1061 C->info.fill_ratio_given = fill; 1062 C->info.fill_ratio_needed = afill; 1063 1064 #if defined(PETSC_USE_INFO) 1065 if (ci[am]) { 1066 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1067 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1068 } else { 1069 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1070 } 1071 #endif 1072 1073 /* Step 4: Free temporary work areas */ 1074 PetscCall(PetscFree(workj_L1)); 1075 PetscCall(PetscFree(workj_L2)); 1076 PetscCall(PetscFree(workj_L3)); 1077 PetscFunctionReturn(0); 1078 } 1079 1080 /* concatenate unique entries and then sort */ 1081 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) { 1082 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1083 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1084 PetscInt *ci, *cj, bcol; 1085 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1086 PetscReal afill; 1087 PetscInt i, j, ndouble = 0; 1088 PetscSegBuffer seg, segrow; 1089 char *seen; 1090 1091 PetscFunctionBegin; 1092 PetscCall(PetscMalloc1(am + 1, &ci)); 1093 ci[0] = 0; 1094 1095 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1096 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1097 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1098 PetscCall(PetscCalloc1(bn, &seen)); 1099 1100 /* Determine ci and cj */ 1101 for (i = 0; i < am; i++) { 1102 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1103 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1104 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1105 1106 /* Pack segrow */ 1107 for (j = 0; j < anzi; j++) { 1108 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1109 for (k = bjstart; k < bjend; k++) { 1110 bcol = bj[k]; 1111 if (!seen[bcol]) { /* new entry */ 1112 PetscInt *PETSC_RESTRICT slot; 1113 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1114 *slot = bcol; 1115 seen[bcol] = 1; 1116 packlen++; 1117 } 1118 } 1119 } 1120 1121 /* Check i-th diagonal entry */ 1122 if (C->force_diagonals && !seen[i]) { 1123 PetscInt *PETSC_RESTRICT slot; 1124 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1125 *slot = i; 1126 seen[i] = 1; 1127 packlen++; 1128 } 1129 1130 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1131 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1132 PetscCall(PetscSortInt(packlen, crow)); 1133 ci[i + 1] = ci[i] + packlen; 1134 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1135 } 1136 PetscCall(PetscSegBufferDestroy(&segrow)); 1137 PetscCall(PetscFree(seen)); 1138 1139 /* Column indices are in the segmented buffer */ 1140 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1141 PetscCall(PetscSegBufferDestroy(&seg)); 1142 1143 /* put together the new symbolic matrix */ 1144 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1145 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1146 1147 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1148 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1149 c = (Mat_SeqAIJ *)(C->data); 1150 c->free_a = PETSC_TRUE; 1151 c->free_ij = PETSC_TRUE; 1152 c->nonew = 0; 1153 1154 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1155 1156 /* set MatInfo */ 1157 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1158 if (afill < 1.0) afill = 1.0; 1159 C->info.mallocs = ndouble; 1160 C->info.fill_ratio_given = fill; 1161 C->info.fill_ratio_needed = afill; 1162 1163 #if defined(PETSC_USE_INFO) 1164 if (ci[am]) { 1165 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1166 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1167 } else { 1168 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1169 } 1170 #endif 1171 PetscFunctionReturn(0); 1172 } 1173 1174 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) { 1175 Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 1176 1177 PetscFunctionBegin; 1178 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1179 PetscCall(MatDestroy(&abt->Bt_den)); 1180 PetscCall(MatDestroy(&abt->ABt_den)); 1181 PetscCall(PetscFree(abt)); 1182 PetscFunctionReturn(0); 1183 } 1184 1185 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) { 1186 Mat Bt; 1187 Mat_MatMatTransMult *abt; 1188 Mat_Product *product = C->product; 1189 char *alg; 1190 1191 PetscFunctionBegin; 1192 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1193 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1194 1195 /* create symbolic Bt */ 1196 PetscCall(MatTransposeSymbolic(B, &Bt)); 1197 PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1198 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1199 1200 /* get symbolic C=A*Bt */ 1201 PetscCall(PetscStrallocpy(product->alg, &alg)); 1202 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1203 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1204 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1205 PetscCall(PetscFree(alg)); 1206 1207 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1208 PetscCall(PetscNew(&abt)); 1209 1210 product->data = abt; 1211 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1212 1213 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1214 1215 abt->usecoloring = PETSC_FALSE; 1216 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1217 if (abt->usecoloring) { 1218 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1219 MatTransposeColoring matcoloring; 1220 MatColoring coloring; 1221 ISColoring iscoloring; 1222 Mat Bt_dense, C_dense; 1223 1224 /* inode causes memory problem */ 1225 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1226 1227 PetscCall(MatColoringCreate(C, &coloring)); 1228 PetscCall(MatColoringSetDistance(coloring, 2)); 1229 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1230 PetscCall(MatColoringSetFromOptions(coloring)); 1231 PetscCall(MatColoringApply(coloring, &iscoloring)); 1232 PetscCall(MatColoringDestroy(&coloring)); 1233 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1234 1235 abt->matcoloring = matcoloring; 1236 1237 PetscCall(ISColoringDestroy(&iscoloring)); 1238 1239 /* Create Bt_dense and C_dense = A*Bt_dense */ 1240 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1241 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1242 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1243 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1244 1245 Bt_dense->assembled = PETSC_TRUE; 1246 abt->Bt_den = Bt_dense; 1247 1248 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1249 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1250 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1251 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1252 1253 Bt_dense->assembled = PETSC_TRUE; 1254 abt->ABt_den = C_dense; 1255 1256 #if defined(PETSC_USE_INFO) 1257 { 1258 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1259 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1260 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1261 } 1262 #endif 1263 } 1264 /* clean up */ 1265 PetscCall(MatDestroy(&Bt)); 1266 PetscFunctionReturn(0); 1267 } 1268 1269 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) { 1270 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1271 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1272 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1273 PetscLogDouble flops = 0.0; 1274 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1275 Mat_MatMatTransMult *abt; 1276 Mat_Product *product = C->product; 1277 1278 PetscFunctionBegin; 1279 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1280 abt = (Mat_MatMatTransMult *)product->data; 1281 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1282 /* clear old values in C */ 1283 if (!c->a) { 1284 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1285 c->a = ca; 1286 c->free_a = PETSC_TRUE; 1287 } else { 1288 ca = c->a; 1289 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1290 } 1291 1292 if (abt->usecoloring) { 1293 MatTransposeColoring matcoloring = abt->matcoloring; 1294 Mat Bt_dense, C_dense = abt->ABt_den; 1295 1296 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1297 Bt_dense = abt->Bt_den; 1298 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1299 1300 /* C_dense = A*Bt_dense */ 1301 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1302 1303 /* Recover C from C_dense */ 1304 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1305 PetscFunctionReturn(0); 1306 } 1307 1308 for (i = 0; i < cm; i++) { 1309 anzi = ai[i + 1] - ai[i]; 1310 acol = aj + ai[i]; 1311 aval = aa + ai[i]; 1312 cnzi = ci[i + 1] - ci[i]; 1313 ccol = cj + ci[i]; 1314 cval = ca + ci[i]; 1315 for (j = 0; j < cnzi; j++) { 1316 brow = ccol[j]; 1317 bnzj = bi[brow + 1] - bi[brow]; 1318 bcol = bj + bi[brow]; 1319 bval = ba + bi[brow]; 1320 1321 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1322 nexta = 0; 1323 nextb = 0; 1324 while (nexta < anzi && nextb < bnzj) { 1325 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1326 if (nexta == anzi) break; 1327 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1328 if (nextb == bnzj) break; 1329 if (acol[nexta] == bcol[nextb]) { 1330 cval[j] += aval[nexta] * bval[nextb]; 1331 nexta++; 1332 nextb++; 1333 flops += 2; 1334 } 1335 } 1336 } 1337 } 1338 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1339 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1340 PetscCall(PetscLogFlops(flops)); 1341 PetscFunctionReturn(0); 1342 } 1343 1344 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) { 1345 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 1346 1347 PetscFunctionBegin; 1348 PetscCall(MatDestroy(&atb->At)); 1349 if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 1350 PetscCall(PetscFree(atb)); 1351 PetscFunctionReturn(0); 1352 } 1353 1354 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) { 1355 Mat At = NULL; 1356 Mat_Product *product = C->product; 1357 PetscBool flg, def, square; 1358 1359 PetscFunctionBegin; 1360 MatCheckProduct(C, 4); 1361 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1362 /* outerproduct */ 1363 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1364 if (flg) { 1365 /* create symbolic At */ 1366 if (!square) { 1367 PetscCall(MatTransposeSymbolic(A, &At)); 1368 PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1369 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1370 } 1371 /* get symbolic C=At*B */ 1372 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1373 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1374 1375 /* clean up */ 1376 if (!square) PetscCall(MatDestroy(&At)); 1377 1378 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1379 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1380 PetscFunctionReturn(0); 1381 } 1382 1383 /* matmatmult */ 1384 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1385 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1386 if (flg || def) { 1387 Mat_MatTransMatMult *atb; 1388 1389 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1390 PetscCall(PetscNew(&atb)); 1391 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1392 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1393 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1394 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1395 product->data = atb; 1396 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1397 atb->At = At; 1398 1399 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1400 PetscFunctionReturn(0); 1401 } 1402 1403 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1404 } 1405 1406 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) { 1407 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1408 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1409 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1410 PetscLogDouble flops = 0.0; 1411 MatScalar *aa = a->a, *ba, *ca, *caj; 1412 1413 PetscFunctionBegin; 1414 if (!c->a) { 1415 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1416 1417 c->a = ca; 1418 c->free_a = PETSC_TRUE; 1419 } else { 1420 ca = c->a; 1421 PetscCall(PetscArrayzero(ca, ci[cm])); 1422 } 1423 1424 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1425 for (i = 0; i < am; i++) { 1426 bj = b->j + bi[i]; 1427 ba = b->a + bi[i]; 1428 bnzi = bi[i + 1] - bi[i]; 1429 anzi = ai[i + 1] - ai[i]; 1430 for (j = 0; j < anzi; j++) { 1431 nextb = 0; 1432 crow = *aj++; 1433 cjj = cj + ci[crow]; 1434 caj = ca + ci[crow]; 1435 /* perform sparse axpy operation. Note cjj includes bj. */ 1436 for (k = 0; nextb < bnzi; k++) { 1437 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1438 caj[k] += (*aa) * (*(ba + nextb)); 1439 nextb++; 1440 } 1441 } 1442 flops += 2 * bnzi; 1443 aa++; 1444 } 1445 } 1446 1447 /* Assemble the final matrix and clean up */ 1448 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1449 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1450 PetscCall(PetscLogFlops(flops)); 1451 PetscFunctionReturn(0); 1452 } 1453 1454 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) { 1455 PetscFunctionBegin; 1456 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1457 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1458 PetscFunctionReturn(0); 1459 } 1460 1461 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) { 1462 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1463 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1464 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1465 const PetscInt *aj; 1466 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1467 PetscInt clda; 1468 PetscInt am4, bm4, col, i, j, n; 1469 1470 PetscFunctionBegin; 1471 if (!cm || !cn) PetscFunctionReturn(0); 1472 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1473 if (add) { 1474 PetscCall(MatDenseGetArray(C, &c)); 1475 } else { 1476 PetscCall(MatDenseGetArrayWrite(C, &c)); 1477 } 1478 PetscCall(MatDenseGetArrayRead(B, &b)); 1479 PetscCall(MatDenseGetLDA(B, &bm)); 1480 PetscCall(MatDenseGetLDA(C, &clda)); 1481 am4 = 4 * clda; 1482 bm4 = 4 * bm; 1483 b1 = b; 1484 b2 = b1 + bm; 1485 b3 = b2 + bm; 1486 b4 = b3 + bm; 1487 c1 = c; 1488 c2 = c1 + clda; 1489 c3 = c2 + clda; 1490 c4 = c3 + clda; 1491 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1492 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1493 r1 = r2 = r3 = r4 = 0.0; 1494 n = a->i[i + 1] - a->i[i]; 1495 aj = a->j + a->i[i]; 1496 aa = av + a->i[i]; 1497 for (j = 0; j < n; j++) { 1498 const PetscScalar aatmp = aa[j]; 1499 const PetscInt ajtmp = aj[j]; 1500 r1 += aatmp * b1[ajtmp]; 1501 r2 += aatmp * b2[ajtmp]; 1502 r3 += aatmp * b3[ajtmp]; 1503 r4 += aatmp * b4[ajtmp]; 1504 } 1505 if (add) { 1506 c1[i] += r1; 1507 c2[i] += r2; 1508 c3[i] += r3; 1509 c4[i] += r4; 1510 } else { 1511 c1[i] = r1; 1512 c2[i] = r2; 1513 c3[i] = r3; 1514 c4[i] = r4; 1515 } 1516 } 1517 b1 += bm4; 1518 b2 += bm4; 1519 b3 += bm4; 1520 b4 += bm4; 1521 c1 += am4; 1522 c2 += am4; 1523 c3 += am4; 1524 c4 += am4; 1525 } 1526 /* process remaining columns */ 1527 if (col != cn) { 1528 PetscInt rc = cn - col; 1529 1530 if (rc == 1) { 1531 for (i = 0; i < am; i++) { 1532 r1 = 0.0; 1533 n = a->i[i + 1] - a->i[i]; 1534 aj = a->j + a->i[i]; 1535 aa = av + a->i[i]; 1536 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1537 if (add) c1[i] += r1; 1538 else c1[i] = r1; 1539 } 1540 } else if (rc == 2) { 1541 for (i = 0; i < am; i++) { 1542 r1 = r2 = 0.0; 1543 n = a->i[i + 1] - a->i[i]; 1544 aj = a->j + a->i[i]; 1545 aa = av + a->i[i]; 1546 for (j = 0; j < n; j++) { 1547 const PetscScalar aatmp = aa[j]; 1548 const PetscInt ajtmp = aj[j]; 1549 r1 += aatmp * b1[ajtmp]; 1550 r2 += aatmp * b2[ajtmp]; 1551 } 1552 if (add) { 1553 c1[i] += r1; 1554 c2[i] += r2; 1555 } else { 1556 c1[i] = r1; 1557 c2[i] = r2; 1558 } 1559 } 1560 } else { 1561 for (i = 0; i < am; i++) { 1562 r1 = r2 = r3 = 0.0; 1563 n = a->i[i + 1] - a->i[i]; 1564 aj = a->j + a->i[i]; 1565 aa = av + a->i[i]; 1566 for (j = 0; j < n; j++) { 1567 const PetscScalar aatmp = aa[j]; 1568 const PetscInt ajtmp = aj[j]; 1569 r1 += aatmp * b1[ajtmp]; 1570 r2 += aatmp * b2[ajtmp]; 1571 r3 += aatmp * b3[ajtmp]; 1572 } 1573 if (add) { 1574 c1[i] += r1; 1575 c2[i] += r2; 1576 c3[i] += r3; 1577 } else { 1578 c1[i] = r1; 1579 c2[i] = r2; 1580 c3[i] = r3; 1581 } 1582 } 1583 } 1584 } 1585 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1586 if (add) { 1587 PetscCall(MatDenseRestoreArray(C, &c)); 1588 } else { 1589 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1590 } 1591 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1592 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1593 PetscFunctionReturn(0); 1594 } 1595 1596 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) { 1597 PetscFunctionBegin; 1598 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1599 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1600 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1601 1602 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1603 PetscFunctionReturn(0); 1604 } 1605 1606 /* ------------------------------------------------------- */ 1607 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) { 1608 PetscFunctionBegin; 1609 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1610 C->ops->productsymbolic = MatProductSymbolic_AB; 1611 PetscFunctionReturn(0); 1612 } 1613 1614 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1615 1616 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) { 1617 PetscFunctionBegin; 1618 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1619 C->ops->productsymbolic = MatProductSymbolic_AtB; 1620 PetscFunctionReturn(0); 1621 } 1622 1623 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) { 1624 PetscFunctionBegin; 1625 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1626 C->ops->productsymbolic = MatProductSymbolic_ABt; 1627 PetscFunctionReturn(0); 1628 } 1629 1630 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) { 1631 Mat_Product *product = C->product; 1632 1633 PetscFunctionBegin; 1634 switch (product->type) { 1635 case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); break; 1636 case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); break; 1637 case MATPRODUCT_ABt: PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); break; 1638 default: break; 1639 } 1640 PetscFunctionReturn(0); 1641 } 1642 /* ------------------------------------------------------- */ 1643 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) { 1644 Mat_Product *product = C->product; 1645 Mat A = product->A; 1646 PetscBool baij; 1647 1648 PetscFunctionBegin; 1649 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1650 if (!baij) { /* A is seqsbaij */ 1651 PetscBool sbaij; 1652 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1653 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1654 1655 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1656 } else { /* A is seqbaij */ 1657 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1658 } 1659 1660 C->ops->productsymbolic = MatProductSymbolic_AB; 1661 PetscFunctionReturn(0); 1662 } 1663 1664 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) { 1665 Mat_Product *product = C->product; 1666 1667 PetscFunctionBegin; 1668 MatCheckProduct(C, 1); 1669 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1670 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1671 PetscFunctionReturn(0); 1672 } 1673 1674 /* ------------------------------------------------------- */ 1675 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) { 1676 PetscFunctionBegin; 1677 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1678 C->ops->productsymbolic = MatProductSymbolic_AB; 1679 PetscFunctionReturn(0); 1680 } 1681 1682 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) { 1683 Mat_Product *product = C->product; 1684 1685 PetscFunctionBegin; 1686 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1687 PetscFunctionReturn(0); 1688 } 1689 /* ------------------------------------------------------- */ 1690 1691 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) { 1692 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1693 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1694 PetscInt *bi = b->i, *bj = b->j; 1695 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1696 MatScalar *btval, *btval_den, *ba = b->a; 1697 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1698 1699 PetscFunctionBegin; 1700 btval_den = btdense->v; 1701 PetscCall(PetscArrayzero(btval_den, m * n)); 1702 for (k = 0; k < ncolors; k++) { 1703 ncolumns = coloring->ncolumns[k]; 1704 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1705 col = *(columns + colorforcol[k] + l); 1706 btcol = bj + bi[col]; 1707 btval = ba + bi[col]; 1708 anz = bi[col + 1] - bi[col]; 1709 for (j = 0; j < anz; j++) { 1710 brow = btcol[j]; 1711 btval_den[brow] = btval[j]; 1712 } 1713 } 1714 btval_den += m; 1715 } 1716 PetscFunctionReturn(0); 1717 } 1718 1719 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) { 1720 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1721 const PetscScalar *ca_den, *ca_den_ptr; 1722 PetscScalar *ca = csp->a; 1723 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1724 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1725 PetscInt nrows, *row, *idx; 1726 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1727 1728 PetscFunctionBegin; 1729 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1730 1731 if (brows > 0) { 1732 PetscInt *lstart, row_end, row_start; 1733 lstart = matcoloring->lstart; 1734 PetscCall(PetscArrayzero(lstart, ncolors)); 1735 1736 row_end = brows; 1737 if (row_end > m) row_end = m; 1738 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1739 ca_den_ptr = ca_den; 1740 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1741 nrows = matcoloring->nrows[k]; 1742 row = rows + colorforrow[k]; 1743 idx = den2sp + colorforrow[k]; 1744 for (l = lstart[k]; l < nrows; l++) { 1745 if (row[l] >= row_end) { 1746 lstart[k] = l; 1747 break; 1748 } else { 1749 ca[idx[l]] = ca_den_ptr[row[l]]; 1750 } 1751 } 1752 ca_den_ptr += m; 1753 } 1754 row_end += brows; 1755 if (row_end > m) row_end = m; 1756 } 1757 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1758 ca_den_ptr = ca_den; 1759 for (k = 0; k < ncolors; k++) { 1760 nrows = matcoloring->nrows[k]; 1761 row = rows + colorforrow[k]; 1762 idx = den2sp + colorforrow[k]; 1763 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1764 ca_den_ptr += m; 1765 } 1766 } 1767 1768 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1769 #if defined(PETSC_USE_INFO) 1770 if (matcoloring->brows > 0) { 1771 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1772 } else { 1773 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1774 } 1775 #endif 1776 PetscFunctionReturn(0); 1777 } 1778 1779 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) { 1780 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1781 const PetscInt *is, *ci, *cj, *row_idx; 1782 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1783 IS *isa; 1784 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1785 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1786 PetscInt *colorforcol, *columns, *columns_i, brows; 1787 PetscBool flg; 1788 1789 PetscFunctionBegin; 1790 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1791 1792 /* bs >1 is not being tested yet! */ 1793 Nbs = mat->cmap->N / bs; 1794 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1795 c->N = Nbs; 1796 c->m = c->M; 1797 c->rstart = 0; 1798 c->brows = 100; 1799 1800 c->ncolors = nis; 1801 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1802 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1803 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1804 1805 brows = c->brows; 1806 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1807 if (flg) c->brows = brows; 1808 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1809 1810 colorforrow[0] = 0; 1811 rows_i = rows; 1812 den2sp_i = den2sp; 1813 1814 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1815 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1816 1817 colorforcol[0] = 0; 1818 columns_i = columns; 1819 1820 /* get column-wise storage of mat */ 1821 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1822 1823 cm = c->m; 1824 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1825 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1826 for (i = 0; i < nis; i++) { /* loop over color */ 1827 PetscCall(ISGetLocalSize(isa[i], &n)); 1828 PetscCall(ISGetIndices(isa[i], &is)); 1829 1830 c->ncolumns[i] = n; 1831 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1832 colorforcol[i + 1] = colorforcol[i] + n; 1833 columns_i += n; 1834 1835 /* fast, crude version requires O(N*N) work */ 1836 PetscCall(PetscArrayzero(rowhit, cm)); 1837 1838 for (j = 0; j < n; j++) { /* loop over columns*/ 1839 col = is[j]; 1840 row_idx = cj + ci[col]; 1841 m = ci[col + 1] - ci[col]; 1842 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1843 idxhit[*row_idx] = spidx[ci[col] + k]; 1844 rowhit[*row_idx++] = col + 1; 1845 } 1846 } 1847 /* count the number of hits */ 1848 nrows = 0; 1849 for (j = 0; j < cm; j++) { 1850 if (rowhit[j]) nrows++; 1851 } 1852 c->nrows[i] = nrows; 1853 colorforrow[i + 1] = colorforrow[i] + nrows; 1854 1855 nrows = 0; 1856 for (j = 0; j < cm; j++) { /* loop over rows */ 1857 if (rowhit[j]) { 1858 rows_i[nrows] = j; 1859 den2sp_i[nrows] = idxhit[j]; 1860 nrows++; 1861 } 1862 } 1863 den2sp_i += nrows; 1864 1865 PetscCall(ISRestoreIndices(isa[i], &is)); 1866 rows_i += nrows; 1867 } 1868 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1869 PetscCall(PetscFree(rowhit)); 1870 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1871 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1872 1873 c->colorforrow = colorforrow; 1874 c->rows = rows; 1875 c->den2sp = den2sp; 1876 c->colorforcol = colorforcol; 1877 c->columns = columns; 1878 1879 PetscCall(PetscFree(idxhit)); 1880 PetscFunctionReturn(0); 1881 } 1882 1883 /* --------------------------------------------------------------- */ 1884 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) { 1885 Mat_Product *product = C->product; 1886 Mat A = product->A, B = product->B; 1887 1888 PetscFunctionBegin; 1889 if (C->ops->mattransposemultnumeric) { 1890 /* Alg: "outerproduct" */ 1891 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1892 } else { 1893 /* Alg: "matmatmult" -- C = At*B */ 1894 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1895 1896 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1897 if (atb->At) { 1898 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1899 user may have called MatProductReplaceMats() to get this A=product->A */ 1900 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1901 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1902 } 1903 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1904 } 1905 PetscFunctionReturn(0); 1906 } 1907 1908 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) { 1909 Mat_Product *product = C->product; 1910 Mat A = product->A, B = product->B; 1911 PetscReal fill = product->fill; 1912 1913 PetscFunctionBegin; 1914 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1915 1916 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1917 PetscFunctionReturn(0); 1918 } 1919 1920 /* --------------------------------------------------------------- */ 1921 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) { 1922 Mat_Product *product = C->product; 1923 PetscInt alg = 0; /* default algorithm */ 1924 PetscBool flg = PETSC_FALSE; 1925 #if !defined(PETSC_HAVE_HYPRE) 1926 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1927 PetscInt nalg = 7; 1928 #else 1929 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1930 PetscInt nalg = 8; 1931 #endif 1932 1933 PetscFunctionBegin; 1934 /* Set default algorithm */ 1935 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 1936 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 1937 1938 /* Get runtime option */ 1939 if (product->api_user) { 1940 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 1941 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 1942 PetscOptionsEnd(); 1943 } else { 1944 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 1945 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 1946 PetscOptionsEnd(); 1947 } 1948 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 1949 1950 C->ops->productsymbolic = MatProductSymbolic_AB; 1951 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 1952 PetscFunctionReturn(0); 1953 } 1954 1955 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) { 1956 Mat_Product *product = C->product; 1957 PetscInt alg = 0; /* default algorithm */ 1958 PetscBool flg = PETSC_FALSE; 1959 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 1960 PetscInt nalg = 3; 1961 1962 PetscFunctionBegin; 1963 /* Get runtime option */ 1964 if (product->api_user) { 1965 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 1966 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 1967 PetscOptionsEnd(); 1968 } else { 1969 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 1970 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 1971 PetscOptionsEnd(); 1972 } 1973 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 1974 1975 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 1976 PetscFunctionReturn(0); 1977 } 1978 1979 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) { 1980 Mat_Product *product = C->product; 1981 PetscInt alg = 0; /* default algorithm */ 1982 PetscBool flg = PETSC_FALSE; 1983 const char *algTypes[2] = {"default", "color"}; 1984 PetscInt nalg = 2; 1985 1986 PetscFunctionBegin; 1987 /* Set default algorithm */ 1988 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 1989 if (!flg) { 1990 alg = 1; 1991 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 1992 } 1993 1994 /* Get runtime option */ 1995 if (product->api_user) { 1996 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 1997 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 1998 PetscOptionsEnd(); 1999 } else { 2000 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2001 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2002 PetscOptionsEnd(); 2003 } 2004 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2005 2006 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2007 C->ops->productsymbolic = MatProductSymbolic_ABt; 2008 PetscFunctionReturn(0); 2009 } 2010 2011 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) { 2012 Mat_Product *product = C->product; 2013 PetscBool flg = PETSC_FALSE; 2014 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2015 #if !defined(PETSC_HAVE_HYPRE) 2016 const char *algTypes[2] = {"scalable", "rap"}; 2017 PetscInt nalg = 2; 2018 #else 2019 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2020 PetscInt nalg = 3; 2021 #endif 2022 2023 PetscFunctionBegin; 2024 /* Set default algorithm */ 2025 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2026 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2027 2028 /* Get runtime option */ 2029 if (product->api_user) { 2030 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2031 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2032 PetscOptionsEnd(); 2033 } else { 2034 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2035 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2036 PetscOptionsEnd(); 2037 } 2038 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2039 2040 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2041 PetscFunctionReturn(0); 2042 } 2043 2044 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) { 2045 Mat_Product *product = C->product; 2046 PetscBool flg = PETSC_FALSE; 2047 PetscInt alg = 0; /* default algorithm */ 2048 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2049 PetscInt nalg = 3; 2050 2051 PetscFunctionBegin; 2052 /* Set default algorithm */ 2053 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2054 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2055 2056 /* Get runtime option */ 2057 if (product->api_user) { 2058 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2059 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2060 PetscOptionsEnd(); 2061 } else { 2062 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2063 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2064 PetscOptionsEnd(); 2065 } 2066 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2067 2068 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2069 PetscFunctionReturn(0); 2070 } 2071 2072 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2073 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) { 2074 Mat_Product *product = C->product; 2075 PetscInt alg = 0; /* default algorithm */ 2076 PetscBool flg = PETSC_FALSE; 2077 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2078 PetscInt nalg = 7; 2079 2080 PetscFunctionBegin; 2081 /* Set default algorithm */ 2082 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2083 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2084 2085 /* Get runtime option */ 2086 if (product->api_user) { 2087 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2088 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2089 PetscOptionsEnd(); 2090 } else { 2091 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2092 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2093 PetscOptionsEnd(); 2094 } 2095 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2096 2097 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2098 C->ops->productsymbolic = MatProductSymbolic_ABC; 2099 PetscFunctionReturn(0); 2100 } 2101 2102 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) { 2103 Mat_Product *product = C->product; 2104 2105 PetscFunctionBegin; 2106 switch (product->type) { 2107 case MATPRODUCT_AB: PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); break; 2108 case MATPRODUCT_AtB: PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); break; 2109 case MATPRODUCT_ABt: PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); break; 2110 case MATPRODUCT_PtAP: PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); break; 2111 case MATPRODUCT_RARt: PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); break; 2112 case MATPRODUCT_ABC: PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); break; 2113 default: break; 2114 } 2115 PetscFunctionReturn(0); 2116 } 2117