1 2 /* 3 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4 C = A * B 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <petscbt.h> 10 #include <petsc/private/isimpl.h> 11 #include <../src/mat/impls/dense/seq/dense.h> 12 13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 14 { 15 PetscFunctionBegin; 16 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 17 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 18 PetscFunctionReturn(PETSC_SUCCESS); 19 } 20 21 /* Modified from MatCreateSeqAIJWithArrays() */ 22 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 23 { 24 PetscInt ii; 25 Mat_SeqAIJ *aij; 26 PetscBool isseqaij, osingle, ofree_a, ofree_ij; 27 28 PetscFunctionBegin; 29 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 30 PetscCall(MatSetSizes(mat, m, n, m, n)); 31 32 if (!mtype) { 33 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 34 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 35 } else { 36 PetscCall(MatSetType(mat, mtype)); 37 } 38 39 aij = (Mat_SeqAIJ *)(mat)->data; 40 osingle = aij->singlemalloc; 41 ofree_a = aij->free_a; 42 ofree_ij = aij->free_ij; 43 /* changes the free flags */ 44 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 45 46 PetscCall(PetscFree(aij->ilen)); 47 PetscCall(PetscFree(aij->imax)); 48 PetscCall(PetscMalloc1(m, &aij->imax)); 49 PetscCall(PetscMalloc1(m, &aij->ilen)); 50 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 51 const PetscInt rnz = i[ii + 1] - i[ii]; 52 aij->nonzerorowcnt += !!rnz; 53 aij->rmax = PetscMax(aij->rmax, rnz); 54 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 55 } 56 aij->maxnz = i[m]; 57 aij->nz = i[m]; 58 59 if (osingle) { 60 PetscCall(PetscFree3(aij->a, aij->j, aij->i)); 61 } else { 62 if (ofree_a) PetscCall(PetscFree(aij->a)); 63 if (ofree_ij) PetscCall(PetscFree(aij->j)); 64 if (ofree_ij) PetscCall(PetscFree(aij->i)); 65 } 66 aij->i = i; 67 aij->j = j; 68 aij->a = a; 69 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 70 /* default to not retain ownership */ 71 aij->singlemalloc = PETSC_FALSE; 72 aij->free_a = PETSC_FALSE; 73 aij->free_ij = PETSC_FALSE; 74 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 75 PetscFunctionReturn(PETSC_SUCCESS); 76 } 77 78 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 79 { 80 Mat_Product *product = C->product; 81 MatProductAlgorithm alg; 82 PetscBool flg; 83 84 PetscFunctionBegin; 85 if (product) { 86 alg = product->alg; 87 } else { 88 alg = "sorted"; 89 } 90 /* sorted */ 91 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 92 if (flg) { 93 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 94 PetscFunctionReturn(PETSC_SUCCESS); 95 } 96 97 /* scalable */ 98 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 99 if (flg) { 100 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 101 PetscFunctionReturn(PETSC_SUCCESS); 102 } 103 104 /* scalable_fast */ 105 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 106 if (flg) { 107 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 108 PetscFunctionReturn(PETSC_SUCCESS); 109 } 110 111 /* heap */ 112 PetscCall(PetscStrcmp(alg, "heap", &flg)); 113 if (flg) { 114 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 115 PetscFunctionReturn(PETSC_SUCCESS); 116 } 117 118 /* btheap */ 119 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 120 if (flg) { 121 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 122 PetscFunctionReturn(PETSC_SUCCESS); 123 } 124 125 /* llcondensed */ 126 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 127 if (flg) { 128 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 129 PetscFunctionReturn(PETSC_SUCCESS); 130 } 131 132 /* rowmerge */ 133 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 134 if (flg) { 135 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 #if defined(PETSC_HAVE_HYPRE) 140 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 141 if (flg) { 142 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 143 PetscFunctionReturn(PETSC_SUCCESS); 144 } 145 #endif 146 147 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 148 } 149 150 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 151 { 152 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 153 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 154 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 155 PetscReal afill; 156 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 157 PetscHMapI ta; 158 PetscBT lnkbt; 159 PetscFreeSpaceList free_space = NULL, current_space = NULL; 160 161 PetscFunctionBegin; 162 /* Get ci and cj */ 163 /* Allocate ci array, arrays for fill computation and */ 164 /* free space for accumulating nonzero column info */ 165 PetscCall(PetscMalloc1(am + 2, &ci)); 166 ci[0] = 0; 167 168 /* create and initialize a linked list */ 169 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 170 MatRowMergeMax_SeqAIJ(b, bm, ta); 171 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 172 PetscCall(PetscHMapIDestroy(&ta)); 173 174 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 175 176 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 177 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 178 179 current_space = free_space; 180 181 /* Determine ci and cj */ 182 for (i = 0; i < am; i++) { 183 anzi = ai[i + 1] - ai[i]; 184 aj = a->j + ai[i]; 185 for (j = 0; j < anzi; j++) { 186 brow = aj[j]; 187 bnzj = bi[brow + 1] - bi[brow]; 188 bj = b->j + bi[brow]; 189 /* add non-zero cols of B into the sorted linked list lnk */ 190 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 191 } 192 /* add possible missing diagonal entry */ 193 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 194 cnzi = lnk[0]; 195 196 /* If free space is not available, make more free space */ 197 /* Double the amount of total space in the list */ 198 if (current_space->local_remaining < cnzi) { 199 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 200 ndouble++; 201 } 202 203 /* Copy data into free space, then initialize lnk */ 204 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 205 206 current_space->array += cnzi; 207 current_space->local_used += cnzi; 208 current_space->local_remaining -= cnzi; 209 210 ci[i + 1] = ci[i] + cnzi; 211 } 212 213 /* Column indices are in the list of free space */ 214 /* Allocate space for cj, initialize cj, and */ 215 /* destroy list of free space and other temporary array(s) */ 216 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 217 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 218 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 219 220 /* put together the new symbolic matrix */ 221 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 222 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 223 224 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 225 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 226 c = (Mat_SeqAIJ *)(C->data); 227 c->free_a = PETSC_FALSE; 228 c->free_ij = PETSC_TRUE; 229 c->nonew = 0; 230 231 /* fast, needs non-scalable O(bn) array 'abdense' */ 232 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 233 234 /* set MatInfo */ 235 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 236 if (afill < 1.0) afill = 1.0; 237 C->info.mallocs = ndouble; 238 C->info.fill_ratio_given = fill; 239 C->info.fill_ratio_needed = afill; 240 241 #if defined(PETSC_USE_INFO) 242 if (ci[am]) { 243 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 244 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 245 } else { 246 PetscCall(PetscInfo(C, "Empty matrix product\n")); 247 } 248 #endif 249 PetscFunctionReturn(PETSC_SUCCESS); 250 } 251 252 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 253 { 254 PetscLogDouble flops = 0.0; 255 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 256 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 257 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 258 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 259 PetscInt am = A->rmap->n, cm = C->rmap->n; 260 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 261 PetscScalar *ca, valtmp; 262 PetscScalar *ab_dense; 263 PetscContainer cab_dense; 264 const PetscScalar *aa, *ba, *baj; 265 266 PetscFunctionBegin; 267 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 268 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 269 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 270 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 271 c->a = ca; 272 c->free_a = PETSC_TRUE; 273 } else ca = c->a; 274 275 /* TODO this should be done in the symbolic phase */ 276 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 277 that is hard to eradicate) */ 278 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 279 if (!cab_dense) { 280 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 281 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 282 PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 283 PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 284 PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 285 PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 286 } 287 PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 288 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 289 290 /* clean old values in C */ 291 PetscCall(PetscArrayzero(ca, ci[cm])); 292 /* Traverse A row-wise. */ 293 /* Build the ith row in C by summing over nonzero columns in A, */ 294 /* the rows of B corresponding to nonzeros of A. */ 295 for (i = 0; i < am; i++) { 296 anzi = ai[i + 1] - ai[i]; 297 for (j = 0; j < anzi; j++) { 298 brow = aj[j]; 299 bnzi = bi[brow + 1] - bi[brow]; 300 bjj = bj + bi[brow]; 301 baj = ba + bi[brow]; 302 /* perform dense axpy */ 303 valtmp = aa[j]; 304 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 305 flops += 2 * bnzi; 306 } 307 aj += anzi; 308 aa += anzi; 309 310 cnzi = ci[i + 1] - ci[i]; 311 for (k = 0; k < cnzi; k++) { 312 ca[k] += ab_dense[cj[k]]; 313 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 314 } 315 flops += cnzi; 316 cj += cnzi; 317 ca += cnzi; 318 } 319 #if defined(PETSC_HAVE_DEVICE) 320 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 321 #endif 322 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 323 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 324 PetscCall(PetscLogFlops(flops)); 325 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 326 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 327 PetscFunctionReturn(PETSC_SUCCESS); 328 } 329 330 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 331 { 332 PetscLogDouble flops = 0.0; 333 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 334 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 335 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 336 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 337 PetscInt am = A->rmap->N, cm = C->rmap->N; 338 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 339 PetscScalar *ca = c->a, valtmp; 340 const PetscScalar *aa, *ba, *baj; 341 PetscInt nextb; 342 343 PetscFunctionBegin; 344 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 345 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 346 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 347 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 348 c->a = ca; 349 c->free_a = PETSC_TRUE; 350 } 351 352 /* clean old values in C */ 353 PetscCall(PetscArrayzero(ca, ci[cm])); 354 /* Traverse A row-wise. */ 355 /* Build the ith row in C by summing over nonzero columns in A, */ 356 /* the rows of B corresponding to nonzeros of A. */ 357 for (i = 0; i < am; i++) { 358 anzi = ai[i + 1] - ai[i]; 359 cnzi = ci[i + 1] - ci[i]; 360 for (j = 0; j < anzi; j++) { 361 brow = aj[j]; 362 bnzi = bi[brow + 1] - bi[brow]; 363 bjj = bj + bi[brow]; 364 baj = ba + bi[brow]; 365 /* perform sparse axpy */ 366 valtmp = aa[j]; 367 nextb = 0; 368 for (k = 0; nextb < bnzi; k++) { 369 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 370 ca[k] += valtmp * baj[nextb++]; 371 } 372 } 373 flops += 2 * bnzi; 374 } 375 aj += anzi; 376 aa += anzi; 377 cj += cnzi; 378 ca += cnzi; 379 } 380 #if defined(PETSC_HAVE_DEVICE) 381 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 382 #endif 383 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 384 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 385 PetscCall(PetscLogFlops(flops)); 386 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 387 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 388 PetscFunctionReturn(PETSC_SUCCESS); 389 } 390 391 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 392 { 393 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 394 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 395 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 396 MatScalar *ca; 397 PetscReal afill; 398 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 399 PetscHMapI ta; 400 PetscFreeSpaceList free_space = NULL, current_space = NULL; 401 402 PetscFunctionBegin; 403 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 404 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 405 PetscCall(PetscMalloc1(am + 2, &ci)); 406 ci[0] = 0; 407 408 /* create and initialize a linked list */ 409 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 410 MatRowMergeMax_SeqAIJ(b, bm, ta); 411 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 412 PetscCall(PetscHMapIDestroy(&ta)); 413 414 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 415 416 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 417 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 418 current_space = free_space; 419 420 /* Determine ci and cj */ 421 for (i = 0; i < am; i++) { 422 anzi = ai[i + 1] - ai[i]; 423 aj = a->j + ai[i]; 424 for (j = 0; j < anzi; j++) { 425 brow = aj[j]; 426 bnzj = bi[brow + 1] - bi[brow]; 427 bj = b->j + bi[brow]; 428 /* add non-zero cols of B into the sorted linked list lnk */ 429 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 430 } 431 /* add possible missing diagonal entry */ 432 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 433 cnzi = lnk[1]; 434 435 /* If free space is not available, make more free space */ 436 /* Double the amount of total space in the list */ 437 if (current_space->local_remaining < cnzi) { 438 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 439 ndouble++; 440 } 441 442 /* Copy data into free space, then initialize lnk */ 443 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 444 445 current_space->array += cnzi; 446 current_space->local_used += cnzi; 447 current_space->local_remaining -= cnzi; 448 449 ci[i + 1] = ci[i] + cnzi; 450 } 451 452 /* Column indices are in the list of free space */ 453 /* Allocate space for cj, initialize cj, and */ 454 /* destroy list of free space and other temporary array(s) */ 455 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 456 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 457 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 458 459 /* Allocate space for ca */ 460 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 461 462 /* put together the new symbolic matrix */ 463 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 464 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 465 466 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 467 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 468 c = (Mat_SeqAIJ *)(C->data); 469 c->free_a = PETSC_TRUE; 470 c->free_ij = PETSC_TRUE; 471 c->nonew = 0; 472 473 /* slower, less memory */ 474 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 475 476 /* set MatInfo */ 477 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 478 if (afill < 1.0) afill = 1.0; 479 C->info.mallocs = ndouble; 480 C->info.fill_ratio_given = fill; 481 C->info.fill_ratio_needed = afill; 482 483 #if defined(PETSC_USE_INFO) 484 if (ci[am]) { 485 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 486 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 487 } else { 488 PetscCall(PetscInfo(C, "Empty matrix product\n")); 489 } 490 #endif 491 PetscFunctionReturn(PETSC_SUCCESS); 492 } 493 494 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 495 { 496 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 497 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 498 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 499 MatScalar *ca; 500 PetscReal afill; 501 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 502 PetscHMapI ta; 503 PetscFreeSpaceList free_space = NULL, current_space = NULL; 504 505 PetscFunctionBegin; 506 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 507 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 508 PetscCall(PetscMalloc1(am + 2, &ci)); 509 ci[0] = 0; 510 511 /* create and initialize a linked list */ 512 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 513 MatRowMergeMax_SeqAIJ(b, bm, ta); 514 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 515 PetscCall(PetscHMapIDestroy(&ta)); 516 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 517 518 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 519 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 520 current_space = free_space; 521 522 /* Determine ci and cj */ 523 for (i = 0; i < am; i++) { 524 anzi = ai[i + 1] - ai[i]; 525 aj = a->j + ai[i]; 526 for (j = 0; j < anzi; j++) { 527 brow = aj[j]; 528 bnzj = bi[brow + 1] - bi[brow]; 529 bj = b->j + bi[brow]; 530 /* add non-zero cols of B into the sorted linked list lnk */ 531 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 532 } 533 /* add possible missing diagonal entry */ 534 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 535 536 cnzi = lnk[0]; 537 538 /* If free space is not available, make more free space */ 539 /* Double the amount of total space in the list */ 540 if (current_space->local_remaining < cnzi) { 541 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 542 ndouble++; 543 } 544 545 /* Copy data into free space, then initialize lnk */ 546 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 547 548 current_space->array += cnzi; 549 current_space->local_used += cnzi; 550 current_space->local_remaining -= cnzi; 551 552 ci[i + 1] = ci[i] + cnzi; 553 } 554 555 /* Column indices are in the list of free space */ 556 /* Allocate space for cj, initialize cj, and */ 557 /* destroy list of free space and other temporary array(s) */ 558 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 559 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 560 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 561 562 /* Allocate space for ca */ 563 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 564 565 /* put together the new symbolic matrix */ 566 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 567 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 568 569 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 570 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 571 c = (Mat_SeqAIJ *)(C->data); 572 c->free_a = PETSC_TRUE; 573 c->free_ij = PETSC_TRUE; 574 c->nonew = 0; 575 576 /* slower, less memory */ 577 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 578 579 /* set MatInfo */ 580 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 581 if (afill < 1.0) afill = 1.0; 582 C->info.mallocs = ndouble; 583 C->info.fill_ratio_given = fill; 584 C->info.fill_ratio_needed = afill; 585 586 #if defined(PETSC_USE_INFO) 587 if (ci[am]) { 588 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 589 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 590 } else { 591 PetscCall(PetscInfo(C, "Empty matrix product\n")); 592 } 593 #endif 594 PetscFunctionReturn(PETSC_SUCCESS); 595 } 596 597 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 598 { 599 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 600 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 601 PetscInt *ci, *cj, *bb; 602 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 603 PetscReal afill; 604 PetscInt i, j, col, ndouble = 0; 605 PetscFreeSpaceList free_space = NULL, current_space = NULL; 606 PetscHeap h; 607 608 PetscFunctionBegin; 609 /* Get ci and cj - by merging sorted rows using a heap */ 610 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 611 PetscCall(PetscMalloc1(am + 2, &ci)); 612 ci[0] = 0; 613 614 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 615 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 616 current_space = free_space; 617 618 PetscCall(PetscHeapCreate(a->rmax, &h)); 619 PetscCall(PetscMalloc1(a->rmax, &bb)); 620 621 /* Determine ci and cj */ 622 for (i = 0; i < am; i++) { 623 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 624 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 625 ci[i + 1] = ci[i]; 626 /* Populate the min heap */ 627 for (j = 0; j < anzi; j++) { 628 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 629 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 630 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 631 } 632 } 633 /* Pick off the min element, adding it to free space */ 634 PetscCall(PetscHeapPop(h, &j, &col)); 635 while (j >= 0) { 636 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 637 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 638 ndouble++; 639 } 640 *(current_space->array++) = col; 641 current_space->local_used++; 642 current_space->local_remaining--; 643 ci[i + 1]++; 644 645 /* stash if anything else remains in this row of B */ 646 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 647 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 648 PetscInt j2, col2; 649 PetscCall(PetscHeapPeek(h, &j2, &col2)); 650 if (col2 != col) break; 651 PetscCall(PetscHeapPop(h, &j2, &col2)); 652 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 653 } 654 /* Put any stashed elements back into the min heap */ 655 PetscCall(PetscHeapUnstash(h)); 656 PetscCall(PetscHeapPop(h, &j, &col)); 657 } 658 } 659 PetscCall(PetscFree(bb)); 660 PetscCall(PetscHeapDestroy(&h)); 661 662 /* Column indices are in the list of free space */ 663 /* Allocate space for cj, initialize cj, and */ 664 /* destroy list of free space and other temporary array(s) */ 665 PetscCall(PetscMalloc1(ci[am], &cj)); 666 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 667 668 /* put together the new symbolic matrix */ 669 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 670 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 671 672 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 673 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 674 c = (Mat_SeqAIJ *)(C->data); 675 c->free_a = PETSC_TRUE; 676 c->free_ij = PETSC_TRUE; 677 c->nonew = 0; 678 679 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 680 681 /* set MatInfo */ 682 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 683 if (afill < 1.0) afill = 1.0; 684 C->info.mallocs = ndouble; 685 C->info.fill_ratio_given = fill; 686 C->info.fill_ratio_needed = afill; 687 688 #if defined(PETSC_USE_INFO) 689 if (ci[am]) { 690 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 691 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 692 } else { 693 PetscCall(PetscInfo(C, "Empty matrix product\n")); 694 } 695 #endif 696 PetscFunctionReturn(PETSC_SUCCESS); 697 } 698 699 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 700 { 701 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 702 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 703 PetscInt *ci, *cj, *bb; 704 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 705 PetscReal afill; 706 PetscInt i, j, col, ndouble = 0; 707 PetscFreeSpaceList free_space = NULL, current_space = NULL; 708 PetscHeap h; 709 PetscBT bt; 710 711 PetscFunctionBegin; 712 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 713 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 714 PetscCall(PetscMalloc1(am + 2, &ci)); 715 ci[0] = 0; 716 717 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 718 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 719 720 current_space = free_space; 721 722 PetscCall(PetscHeapCreate(a->rmax, &h)); 723 PetscCall(PetscMalloc1(a->rmax, &bb)); 724 PetscCall(PetscBTCreate(bn, &bt)); 725 726 /* Determine ci and cj */ 727 for (i = 0; i < am; i++) { 728 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 729 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 730 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 731 ci[i + 1] = ci[i]; 732 /* Populate the min heap */ 733 for (j = 0; j < anzi; j++) { 734 PetscInt brow = acol[j]; 735 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 736 PetscInt bcol = bj[bb[j]]; 737 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 738 PetscCall(PetscHeapAdd(h, j, bcol)); 739 bb[j]++; 740 break; 741 } 742 } 743 } 744 /* Pick off the min element, adding it to free space */ 745 PetscCall(PetscHeapPop(h, &j, &col)); 746 while (j >= 0) { 747 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 748 fptr = NULL; /* need PetscBTMemzero */ 749 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 750 ndouble++; 751 } 752 *(current_space->array++) = col; 753 current_space->local_used++; 754 current_space->local_remaining--; 755 ci[i + 1]++; 756 757 /* stash if anything else remains in this row of B */ 758 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 759 PetscInt bcol = bj[bb[j]]; 760 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 761 PetscCall(PetscHeapAdd(h, j, bcol)); 762 bb[j]++; 763 break; 764 } 765 } 766 PetscCall(PetscHeapPop(h, &j, &col)); 767 } 768 if (fptr) { /* Clear the bits for this row */ 769 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 770 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 771 PetscCall(PetscBTMemzero(bn, bt)); 772 } 773 } 774 PetscCall(PetscFree(bb)); 775 PetscCall(PetscHeapDestroy(&h)); 776 PetscCall(PetscBTDestroy(&bt)); 777 778 /* Column indices are in the list of free space */ 779 /* Allocate space for cj, initialize cj, and */ 780 /* destroy list of free space and other temporary array(s) */ 781 PetscCall(PetscMalloc1(ci[am], &cj)); 782 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 783 784 /* put together the new symbolic matrix */ 785 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 786 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 787 788 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 789 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 790 c = (Mat_SeqAIJ *)(C->data); 791 c->free_a = PETSC_TRUE; 792 c->free_ij = PETSC_TRUE; 793 c->nonew = 0; 794 795 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 796 797 /* set MatInfo */ 798 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 799 if (afill < 1.0) afill = 1.0; 800 C->info.mallocs = ndouble; 801 C->info.fill_ratio_given = fill; 802 C->info.fill_ratio_needed = afill; 803 804 #if defined(PETSC_USE_INFO) 805 if (ci[am]) { 806 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 807 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 808 } else { 809 PetscCall(PetscInfo(C, "Empty matrix product\n")); 810 } 811 #endif 812 PetscFunctionReturn(PETSC_SUCCESS); 813 } 814 815 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 816 { 817 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 818 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 819 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 820 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 821 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 822 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 823 const PetscInt *brow_ptr[8], *brow_end[8]; 824 PetscInt window[8]; 825 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 826 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 827 PetscReal afill; 828 PetscInt *workj_L1, *workj_L2, *workj_L3; 829 PetscInt L1_nnz, L2_nnz; 830 831 /* Step 1: Get upper bound on memory required for allocation. 832 Because of the way virtual memory works, 833 only the memory pages that are actually needed will be physically allocated. */ 834 PetscFunctionBegin; 835 PetscCall(PetscMalloc1(am + 1, &ci)); 836 for (i = 0; i < am; i++) { 837 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 838 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 839 a_rownnz = 0; 840 for (k = 0; k < anzi; ++k) { 841 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 842 if (a_rownnz > bn) { 843 a_rownnz = bn; 844 break; 845 } 846 } 847 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 848 } 849 /* temporary work areas for merging rows */ 850 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 851 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 852 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 853 854 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 855 c_maxmem = 8 * (ai[am] + bi[bm]); 856 /* Step 2: Populate pattern for C */ 857 PetscCall(PetscMalloc1(c_maxmem, &cj)); 858 859 ci_nnz = 0; 860 ci[0] = 0; 861 worki_L1[0] = 0; 862 worki_L2[0] = 0; 863 for (i = 0; i < am; i++) { 864 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 865 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 866 rowsleft = anzi; 867 inputcol_L1 = acol; 868 L2_nnz = 0; 869 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 870 worki_L2[1] = 0; 871 outputi_nnz = 0; 872 873 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 874 while (ci_nnz + a_maxrownnz > c_maxmem) { 875 c_maxmem *= 2; 876 ndouble++; 877 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 878 } 879 880 while (rowsleft) { 881 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 882 L1_nrows = 0; 883 L1_nnz = 0; 884 inputcol = inputcol_L1; 885 inputi = bi; 886 inputj = bj; 887 888 /* The following macro is used to specialize for small rows in A. 889 This helps with compiler unrolling, improving performance substantially. 890 Input: inputj inputi inputcol bn 891 Output: outputj outputi_nnz */ 892 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 893 do { \ 894 window_min = bn; \ 895 outputi_nnz = 0; \ 896 for (k = 0; k < ANNZ; ++k) { \ 897 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 898 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 899 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 900 window_min = PetscMin(window[k], window_min); \ 901 } \ 902 while (window_min < bn) { \ 903 outputj[outputi_nnz++] = window_min; \ 904 /* advance front and compute new minimum */ \ 905 old_window_min = window_min; \ 906 window_min = bn; \ 907 for (k = 0; k < ANNZ; ++k) { \ 908 if (window[k] == old_window_min) { \ 909 brow_ptr[k]++; \ 910 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 911 } \ 912 window_min = PetscMin(window[k], window_min); \ 913 } \ 914 } \ 915 } while (0) 916 917 /************** L E V E L 1 ***************/ 918 /* Merge up to 8 rows of B to L1 work array*/ 919 while (L1_rowsleft) { 920 outputi_nnz = 0; 921 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 922 else outputj = cj + ci_nnz; /* Merge directly to C */ 923 924 switch (L1_rowsleft) { 925 case 1: 926 brow_ptr[0] = inputj + inputi[inputcol[0]]; 927 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 928 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 929 inputcol += L1_rowsleft; 930 rowsleft -= L1_rowsleft; 931 L1_rowsleft = 0; 932 break; 933 case 2: 934 MatMatMultSymbolic_RowMergeMacro(2); 935 inputcol += L1_rowsleft; 936 rowsleft -= L1_rowsleft; 937 L1_rowsleft = 0; 938 break; 939 case 3: 940 MatMatMultSymbolic_RowMergeMacro(3); 941 inputcol += L1_rowsleft; 942 rowsleft -= L1_rowsleft; 943 L1_rowsleft = 0; 944 break; 945 case 4: 946 MatMatMultSymbolic_RowMergeMacro(4); 947 inputcol += L1_rowsleft; 948 rowsleft -= L1_rowsleft; 949 L1_rowsleft = 0; 950 break; 951 case 5: 952 MatMatMultSymbolic_RowMergeMacro(5); 953 inputcol += L1_rowsleft; 954 rowsleft -= L1_rowsleft; 955 L1_rowsleft = 0; 956 break; 957 case 6: 958 MatMatMultSymbolic_RowMergeMacro(6); 959 inputcol += L1_rowsleft; 960 rowsleft -= L1_rowsleft; 961 L1_rowsleft = 0; 962 break; 963 case 7: 964 MatMatMultSymbolic_RowMergeMacro(7); 965 inputcol += L1_rowsleft; 966 rowsleft -= L1_rowsleft; 967 L1_rowsleft = 0; 968 break; 969 default: 970 MatMatMultSymbolic_RowMergeMacro(8); 971 inputcol += 8; 972 rowsleft -= 8; 973 L1_rowsleft -= 8; 974 break; 975 } 976 inputcol_L1 = inputcol; 977 L1_nnz += outputi_nnz; 978 worki_L1[++L1_nrows] = L1_nnz; 979 } 980 981 /********************** L E V E L 2 ************************/ 982 /* Merge from L1 work array to either C or to L2 work array */ 983 if (anzi > 8) { 984 inputi = worki_L1; 985 inputj = workj_L1; 986 inputcol = workcol; 987 outputi_nnz = 0; 988 989 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 990 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 991 992 switch (L1_nrows) { 993 case 1: 994 brow_ptr[0] = inputj + inputi[inputcol[0]]; 995 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 996 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 997 break; 998 case 2: 999 MatMatMultSymbolic_RowMergeMacro(2); 1000 break; 1001 case 3: 1002 MatMatMultSymbolic_RowMergeMacro(3); 1003 break; 1004 case 4: 1005 MatMatMultSymbolic_RowMergeMacro(4); 1006 break; 1007 case 5: 1008 MatMatMultSymbolic_RowMergeMacro(5); 1009 break; 1010 case 6: 1011 MatMatMultSymbolic_RowMergeMacro(6); 1012 break; 1013 case 7: 1014 MatMatMultSymbolic_RowMergeMacro(7); 1015 break; 1016 case 8: 1017 MatMatMultSymbolic_RowMergeMacro(8); 1018 break; 1019 default: 1020 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1021 } 1022 L2_nnz += outputi_nnz; 1023 worki_L2[++L2_nrows] = L2_nnz; 1024 1025 /************************ L E V E L 3 **********************/ 1026 /* Merge from L2 work array to either C or to L2 work array */ 1027 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1028 inputi = worki_L2; 1029 inputj = workj_L2; 1030 inputcol = workcol; 1031 outputi_nnz = 0; 1032 if (rowsleft) outputj = workj_L3; 1033 else outputj = cj + ci_nnz; 1034 switch (L2_nrows) { 1035 case 1: 1036 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1037 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1038 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1039 break; 1040 case 2: 1041 MatMatMultSymbolic_RowMergeMacro(2); 1042 break; 1043 case 3: 1044 MatMatMultSymbolic_RowMergeMacro(3); 1045 break; 1046 case 4: 1047 MatMatMultSymbolic_RowMergeMacro(4); 1048 break; 1049 case 5: 1050 MatMatMultSymbolic_RowMergeMacro(5); 1051 break; 1052 case 6: 1053 MatMatMultSymbolic_RowMergeMacro(6); 1054 break; 1055 case 7: 1056 MatMatMultSymbolic_RowMergeMacro(7); 1057 break; 1058 case 8: 1059 MatMatMultSymbolic_RowMergeMacro(8); 1060 break; 1061 default: 1062 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1063 } 1064 L2_nrows = 1; 1065 L2_nnz = outputi_nnz; 1066 worki_L2[1] = outputi_nnz; 1067 /* Copy to workj_L2 */ 1068 if (rowsleft) { 1069 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1070 } 1071 } 1072 } 1073 } /* while (rowsleft) */ 1074 #undef MatMatMultSymbolic_RowMergeMacro 1075 1076 /* terminate current row */ 1077 ci_nnz += outputi_nnz; 1078 ci[i + 1] = ci_nnz; 1079 } 1080 1081 /* Step 3: Create the new symbolic matrix */ 1082 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1083 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1084 1085 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1086 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1087 c = (Mat_SeqAIJ *)(C->data); 1088 c->free_a = PETSC_TRUE; 1089 c->free_ij = PETSC_TRUE; 1090 c->nonew = 0; 1091 1092 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1093 1094 /* set MatInfo */ 1095 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1096 if (afill < 1.0) afill = 1.0; 1097 C->info.mallocs = ndouble; 1098 C->info.fill_ratio_given = fill; 1099 C->info.fill_ratio_needed = afill; 1100 1101 #if defined(PETSC_USE_INFO) 1102 if (ci[am]) { 1103 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1104 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1105 } else { 1106 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1107 } 1108 #endif 1109 1110 /* Step 4: Free temporary work areas */ 1111 PetscCall(PetscFree(workj_L1)); 1112 PetscCall(PetscFree(workj_L2)); 1113 PetscCall(PetscFree(workj_L3)); 1114 PetscFunctionReturn(PETSC_SUCCESS); 1115 } 1116 1117 /* concatenate unique entries and then sort */ 1118 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1119 { 1120 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1121 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1122 PetscInt *ci, *cj, bcol; 1123 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1124 PetscReal afill; 1125 PetscInt i, j, ndouble = 0; 1126 PetscSegBuffer seg, segrow; 1127 char *seen; 1128 1129 PetscFunctionBegin; 1130 PetscCall(PetscMalloc1(am + 1, &ci)); 1131 ci[0] = 0; 1132 1133 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1134 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1135 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1136 PetscCall(PetscCalloc1(bn, &seen)); 1137 1138 /* Determine ci and cj */ 1139 for (i = 0; i < am; i++) { 1140 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1141 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1142 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1143 1144 /* Pack segrow */ 1145 for (j = 0; j < anzi; j++) { 1146 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1147 for (k = bjstart; k < bjend; k++) { 1148 bcol = bj[k]; 1149 if (!seen[bcol]) { /* new entry */ 1150 PetscInt *PETSC_RESTRICT slot; 1151 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1152 *slot = bcol; 1153 seen[bcol] = 1; 1154 packlen++; 1155 } 1156 } 1157 } 1158 1159 /* Check i-th diagonal entry */ 1160 if (C->force_diagonals && !seen[i]) { 1161 PetscInt *PETSC_RESTRICT slot; 1162 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1163 *slot = i; 1164 seen[i] = 1; 1165 packlen++; 1166 } 1167 1168 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1169 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1170 PetscCall(PetscSortInt(packlen, crow)); 1171 ci[i + 1] = ci[i] + packlen; 1172 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1173 } 1174 PetscCall(PetscSegBufferDestroy(&segrow)); 1175 PetscCall(PetscFree(seen)); 1176 1177 /* Column indices are in the segmented buffer */ 1178 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1179 PetscCall(PetscSegBufferDestroy(&seg)); 1180 1181 /* put together the new symbolic matrix */ 1182 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1183 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1184 1185 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1186 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1187 c = (Mat_SeqAIJ *)(C->data); 1188 c->free_a = PETSC_TRUE; 1189 c->free_ij = PETSC_TRUE; 1190 c->nonew = 0; 1191 1192 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1193 1194 /* set MatInfo */ 1195 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1196 if (afill < 1.0) afill = 1.0; 1197 C->info.mallocs = ndouble; 1198 C->info.fill_ratio_given = fill; 1199 C->info.fill_ratio_needed = afill; 1200 1201 #if defined(PETSC_USE_INFO) 1202 if (ci[am]) { 1203 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1204 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1205 } else { 1206 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1207 } 1208 #endif 1209 PetscFunctionReturn(PETSC_SUCCESS); 1210 } 1211 1212 static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1213 { 1214 Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 1215 1216 PetscFunctionBegin; 1217 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1218 PetscCall(MatDestroy(&abt->Bt_den)); 1219 PetscCall(MatDestroy(&abt->ABt_den)); 1220 PetscCall(PetscFree(abt)); 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1225 { 1226 Mat Bt; 1227 Mat_MatMatTransMult *abt; 1228 Mat_Product *product = C->product; 1229 char *alg; 1230 1231 PetscFunctionBegin; 1232 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1233 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1234 1235 /* create symbolic Bt */ 1236 PetscCall(MatTransposeSymbolic(B, &Bt)); 1237 PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1238 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1239 1240 /* get symbolic C=A*Bt */ 1241 PetscCall(PetscStrallocpy(product->alg, &alg)); 1242 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1243 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1244 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1245 PetscCall(PetscFree(alg)); 1246 1247 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1248 PetscCall(PetscNew(&abt)); 1249 1250 product->data = abt; 1251 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1252 1253 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1254 1255 abt->usecoloring = PETSC_FALSE; 1256 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1257 if (abt->usecoloring) { 1258 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1259 MatTransposeColoring matcoloring; 1260 MatColoring coloring; 1261 ISColoring iscoloring; 1262 Mat Bt_dense, C_dense; 1263 1264 /* inode causes memory problem */ 1265 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1266 1267 PetscCall(MatColoringCreate(C, &coloring)); 1268 PetscCall(MatColoringSetDistance(coloring, 2)); 1269 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1270 PetscCall(MatColoringSetFromOptions(coloring)); 1271 PetscCall(MatColoringApply(coloring, &iscoloring)); 1272 PetscCall(MatColoringDestroy(&coloring)); 1273 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1274 1275 abt->matcoloring = matcoloring; 1276 1277 PetscCall(ISColoringDestroy(&iscoloring)); 1278 1279 /* Create Bt_dense and C_dense = A*Bt_dense */ 1280 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1281 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1282 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1283 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1284 1285 Bt_dense->assembled = PETSC_TRUE; 1286 abt->Bt_den = Bt_dense; 1287 1288 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1289 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1290 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1291 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1292 1293 Bt_dense->assembled = PETSC_TRUE; 1294 abt->ABt_den = C_dense; 1295 1296 #if defined(PETSC_USE_INFO) 1297 { 1298 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1299 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1300 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1301 } 1302 #endif 1303 } 1304 /* clean up */ 1305 PetscCall(MatDestroy(&Bt)); 1306 PetscFunctionReturn(PETSC_SUCCESS); 1307 } 1308 1309 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1310 { 1311 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1312 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1313 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1314 PetscLogDouble flops = 0.0; 1315 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1316 Mat_MatMatTransMult *abt; 1317 Mat_Product *product = C->product; 1318 1319 PetscFunctionBegin; 1320 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1321 abt = (Mat_MatMatTransMult *)product->data; 1322 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1323 /* clear old values in C */ 1324 if (!c->a) { 1325 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1326 c->a = ca; 1327 c->free_a = PETSC_TRUE; 1328 } else { 1329 ca = c->a; 1330 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1331 } 1332 1333 if (abt->usecoloring) { 1334 MatTransposeColoring matcoloring = abt->matcoloring; 1335 Mat Bt_dense, C_dense = abt->ABt_den; 1336 1337 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1338 Bt_dense = abt->Bt_den; 1339 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1340 1341 /* C_dense = A*Bt_dense */ 1342 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1343 1344 /* Recover C from C_dense */ 1345 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1346 PetscFunctionReturn(PETSC_SUCCESS); 1347 } 1348 1349 for (i = 0; i < cm; i++) { 1350 anzi = ai[i + 1] - ai[i]; 1351 acol = aj + ai[i]; 1352 aval = aa + ai[i]; 1353 cnzi = ci[i + 1] - ci[i]; 1354 ccol = cj + ci[i]; 1355 cval = ca + ci[i]; 1356 for (j = 0; j < cnzi; j++) { 1357 brow = ccol[j]; 1358 bnzj = bi[brow + 1] - bi[brow]; 1359 bcol = bj + bi[brow]; 1360 bval = ba + bi[brow]; 1361 1362 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1363 nexta = 0; 1364 nextb = 0; 1365 while (nexta < anzi && nextb < bnzj) { 1366 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1367 if (nexta == anzi) break; 1368 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1369 if (nextb == bnzj) break; 1370 if (acol[nexta] == bcol[nextb]) { 1371 cval[j] += aval[nexta] * bval[nextb]; 1372 nexta++; 1373 nextb++; 1374 flops += 2; 1375 } 1376 } 1377 } 1378 } 1379 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1380 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1381 PetscCall(PetscLogFlops(flops)); 1382 PetscFunctionReturn(PETSC_SUCCESS); 1383 } 1384 1385 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1386 { 1387 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 1388 1389 PetscFunctionBegin; 1390 PetscCall(MatDestroy(&atb->At)); 1391 if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 1392 PetscCall(PetscFree(atb)); 1393 PetscFunctionReturn(PETSC_SUCCESS); 1394 } 1395 1396 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1397 { 1398 Mat At = NULL; 1399 Mat_Product *product = C->product; 1400 PetscBool flg, def, square; 1401 1402 PetscFunctionBegin; 1403 MatCheckProduct(C, 4); 1404 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1405 /* outerproduct */ 1406 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1407 if (flg) { 1408 /* create symbolic At */ 1409 if (!square) { 1410 PetscCall(MatTransposeSymbolic(A, &At)); 1411 PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1412 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1413 } 1414 /* get symbolic C=At*B */ 1415 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1416 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1417 1418 /* clean up */ 1419 if (!square) PetscCall(MatDestroy(&At)); 1420 1421 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1422 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1423 PetscFunctionReturn(PETSC_SUCCESS); 1424 } 1425 1426 /* matmatmult */ 1427 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1428 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1429 if (flg || def) { 1430 Mat_MatTransMatMult *atb; 1431 1432 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1433 PetscCall(PetscNew(&atb)); 1434 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1435 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1436 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1437 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1438 product->data = atb; 1439 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1440 atb->At = At; 1441 1442 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1443 PetscFunctionReturn(PETSC_SUCCESS); 1444 } 1445 1446 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1447 } 1448 1449 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1450 { 1451 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1452 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1453 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1454 PetscLogDouble flops = 0.0; 1455 MatScalar *aa = a->a, *ba, *ca, *caj; 1456 1457 PetscFunctionBegin; 1458 if (!c->a) { 1459 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1460 1461 c->a = ca; 1462 c->free_a = PETSC_TRUE; 1463 } else { 1464 ca = c->a; 1465 PetscCall(PetscArrayzero(ca, ci[cm])); 1466 } 1467 1468 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1469 for (i = 0; i < am; i++) { 1470 bj = b->j + bi[i]; 1471 ba = b->a + bi[i]; 1472 bnzi = bi[i + 1] - bi[i]; 1473 anzi = ai[i + 1] - ai[i]; 1474 for (j = 0; j < anzi; j++) { 1475 nextb = 0; 1476 crow = *aj++; 1477 cjj = cj + ci[crow]; 1478 caj = ca + ci[crow]; 1479 /* perform sparse axpy operation. Note cjj includes bj. */ 1480 for (k = 0; nextb < bnzi; k++) { 1481 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1482 caj[k] += (*aa) * (*(ba + nextb)); 1483 nextb++; 1484 } 1485 } 1486 flops += 2 * bnzi; 1487 aa++; 1488 } 1489 } 1490 1491 /* Assemble the final matrix and clean up */ 1492 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1493 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1494 PetscCall(PetscLogFlops(flops)); 1495 PetscFunctionReturn(PETSC_SUCCESS); 1496 } 1497 1498 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1499 { 1500 PetscFunctionBegin; 1501 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1502 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1503 PetscFunctionReturn(PETSC_SUCCESS); 1504 } 1505 1506 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1507 { 1508 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1509 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1510 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1511 const PetscInt *aj; 1512 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1513 PetscInt clda; 1514 PetscInt am4, bm4, col, i, j, n; 1515 1516 PetscFunctionBegin; 1517 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1518 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1519 if (add) { 1520 PetscCall(MatDenseGetArray(C, &c)); 1521 } else { 1522 PetscCall(MatDenseGetArrayWrite(C, &c)); 1523 } 1524 PetscCall(MatDenseGetArrayRead(B, &b)); 1525 PetscCall(MatDenseGetLDA(B, &bm)); 1526 PetscCall(MatDenseGetLDA(C, &clda)); 1527 am4 = 4 * clda; 1528 bm4 = 4 * bm; 1529 if (b) { 1530 b1 = b; 1531 b2 = b1 + bm; 1532 b3 = b2 + bm; 1533 b4 = b3 + bm; 1534 } else b1 = b2 = b3 = b4 = NULL; 1535 c1 = c; 1536 c2 = c1 + clda; 1537 c3 = c2 + clda; 1538 c4 = c3 + clda; 1539 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1540 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1541 r1 = r2 = r3 = r4 = 0.0; 1542 n = a->i[i + 1] - a->i[i]; 1543 aj = a->j ? a->j + a->i[i] : NULL; 1544 aa = av ? av + a->i[i] : NULL; 1545 for (j = 0; j < n; j++) { 1546 const PetscScalar aatmp = aa[j]; 1547 const PetscInt ajtmp = aj[j]; 1548 r1 += aatmp * b1[ajtmp]; 1549 r2 += aatmp * b2[ajtmp]; 1550 r3 += aatmp * b3[ajtmp]; 1551 r4 += aatmp * b4[ajtmp]; 1552 } 1553 if (add) { 1554 c1[i] += r1; 1555 c2[i] += r2; 1556 c3[i] += r3; 1557 c4[i] += r4; 1558 } else { 1559 c1[i] = r1; 1560 c2[i] = r2; 1561 c3[i] = r3; 1562 c4[i] = r4; 1563 } 1564 } 1565 if (b) { 1566 b1 += bm4; 1567 b2 += bm4; 1568 b3 += bm4; 1569 b4 += bm4; 1570 } 1571 c1 += am4; 1572 c2 += am4; 1573 c3 += am4; 1574 c4 += am4; 1575 } 1576 /* process remaining columns */ 1577 if (col != cn) { 1578 PetscInt rc = cn - col; 1579 1580 if (rc == 1) { 1581 for (i = 0; i < am; i++) { 1582 r1 = 0.0; 1583 n = a->i[i + 1] - a->i[i]; 1584 aj = a->j + a->i[i]; 1585 aa = av + a->i[i]; 1586 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1587 if (add) c1[i] += r1; 1588 else c1[i] = r1; 1589 } 1590 } else if (rc == 2) { 1591 for (i = 0; i < am; i++) { 1592 r1 = r2 = 0.0; 1593 n = a->i[i + 1] - a->i[i]; 1594 aj = a->j + a->i[i]; 1595 aa = av + a->i[i]; 1596 for (j = 0; j < n; j++) { 1597 const PetscScalar aatmp = aa[j]; 1598 const PetscInt ajtmp = aj[j]; 1599 r1 += aatmp * b1[ajtmp]; 1600 r2 += aatmp * b2[ajtmp]; 1601 } 1602 if (add) { 1603 c1[i] += r1; 1604 c2[i] += r2; 1605 } else { 1606 c1[i] = r1; 1607 c2[i] = r2; 1608 } 1609 } 1610 } else { 1611 for (i = 0; i < am; i++) { 1612 r1 = r2 = r3 = 0.0; 1613 n = a->i[i + 1] - a->i[i]; 1614 aj = a->j ? a->j + a->i[i] : NULL; 1615 aa = av ? av + a->i[i] : NULL; 1616 for (j = 0; j < n; j++) { 1617 const PetscScalar aatmp = aa[j]; 1618 const PetscInt ajtmp = aj[j]; 1619 r1 += aatmp * b1[ajtmp]; 1620 r2 += aatmp * b2[ajtmp]; 1621 r3 += aatmp * b3[ajtmp]; 1622 } 1623 if (add) { 1624 c1[i] += r1; 1625 c2[i] += r2; 1626 c3[i] += r3; 1627 } else { 1628 c1[i] = r1; 1629 c2[i] = r2; 1630 c3[i] = r3; 1631 } 1632 } 1633 } 1634 } 1635 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1636 if (add) { 1637 PetscCall(MatDenseRestoreArray(C, &c)); 1638 } else { 1639 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1640 } 1641 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1642 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1643 PetscFunctionReturn(PETSC_SUCCESS); 1644 } 1645 1646 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1647 { 1648 PetscFunctionBegin; 1649 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1650 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1651 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1652 1653 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1654 PetscFunctionReturn(PETSC_SUCCESS); 1655 } 1656 1657 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1658 { 1659 PetscFunctionBegin; 1660 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1661 C->ops->productsymbolic = MatProductSymbolic_AB; 1662 PetscFunctionReturn(PETSC_SUCCESS); 1663 } 1664 1665 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1666 1667 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1668 { 1669 PetscFunctionBegin; 1670 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1671 C->ops->productsymbolic = MatProductSymbolic_AtB; 1672 PetscFunctionReturn(PETSC_SUCCESS); 1673 } 1674 1675 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1676 { 1677 PetscFunctionBegin; 1678 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1679 C->ops->productsymbolic = MatProductSymbolic_ABt; 1680 PetscFunctionReturn(PETSC_SUCCESS); 1681 } 1682 1683 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1684 { 1685 Mat_Product *product = C->product; 1686 1687 PetscFunctionBegin; 1688 switch (product->type) { 1689 case MATPRODUCT_AB: 1690 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1691 break; 1692 case MATPRODUCT_AtB: 1693 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1694 break; 1695 case MATPRODUCT_ABt: 1696 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1697 break; 1698 default: 1699 break; 1700 } 1701 PetscFunctionReturn(PETSC_SUCCESS); 1702 } 1703 1704 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1705 { 1706 Mat_Product *product = C->product; 1707 Mat A = product->A; 1708 PetscBool baij; 1709 1710 PetscFunctionBegin; 1711 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1712 if (!baij) { /* A is seqsbaij */ 1713 PetscBool sbaij; 1714 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1715 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1716 1717 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1718 } else { /* A is seqbaij */ 1719 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1720 } 1721 1722 C->ops->productsymbolic = MatProductSymbolic_AB; 1723 PetscFunctionReturn(PETSC_SUCCESS); 1724 } 1725 1726 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1727 { 1728 Mat_Product *product = C->product; 1729 1730 PetscFunctionBegin; 1731 MatCheckProduct(C, 1); 1732 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1733 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1734 PetscFunctionReturn(PETSC_SUCCESS); 1735 } 1736 1737 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1738 { 1739 PetscFunctionBegin; 1740 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1741 C->ops->productsymbolic = MatProductSymbolic_AB; 1742 PetscFunctionReturn(PETSC_SUCCESS); 1743 } 1744 1745 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1746 { 1747 Mat_Product *product = C->product; 1748 1749 PetscFunctionBegin; 1750 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1751 PetscFunctionReturn(PETSC_SUCCESS); 1752 } 1753 1754 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1755 { 1756 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1757 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1758 PetscInt *bi = b->i, *bj = b->j; 1759 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1760 MatScalar *btval, *btval_den, *ba = b->a; 1761 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1762 1763 PetscFunctionBegin; 1764 btval_den = btdense->v; 1765 PetscCall(PetscArrayzero(btval_den, m * n)); 1766 for (k = 0; k < ncolors; k++) { 1767 ncolumns = coloring->ncolumns[k]; 1768 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1769 col = *(columns + colorforcol[k] + l); 1770 btcol = bj + bi[col]; 1771 btval = ba + bi[col]; 1772 anz = bi[col + 1] - bi[col]; 1773 for (j = 0; j < anz; j++) { 1774 brow = btcol[j]; 1775 btval_den[brow] = btval[j]; 1776 } 1777 } 1778 btval_den += m; 1779 } 1780 PetscFunctionReturn(PETSC_SUCCESS); 1781 } 1782 1783 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1784 { 1785 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1786 const PetscScalar *ca_den, *ca_den_ptr; 1787 PetscScalar *ca = csp->a; 1788 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1789 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1790 PetscInt nrows, *row, *idx; 1791 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1792 1793 PetscFunctionBegin; 1794 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1795 1796 if (brows > 0) { 1797 PetscInt *lstart, row_end, row_start; 1798 lstart = matcoloring->lstart; 1799 PetscCall(PetscArrayzero(lstart, ncolors)); 1800 1801 row_end = brows; 1802 if (row_end > m) row_end = m; 1803 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1804 ca_den_ptr = ca_den; 1805 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1806 nrows = matcoloring->nrows[k]; 1807 row = rows + colorforrow[k]; 1808 idx = den2sp + colorforrow[k]; 1809 for (l = lstart[k]; l < nrows; l++) { 1810 if (row[l] >= row_end) { 1811 lstart[k] = l; 1812 break; 1813 } else { 1814 ca[idx[l]] = ca_den_ptr[row[l]]; 1815 } 1816 } 1817 ca_den_ptr += m; 1818 } 1819 row_end += brows; 1820 if (row_end > m) row_end = m; 1821 } 1822 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1823 ca_den_ptr = ca_den; 1824 for (k = 0; k < ncolors; k++) { 1825 nrows = matcoloring->nrows[k]; 1826 row = rows + colorforrow[k]; 1827 idx = den2sp + colorforrow[k]; 1828 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1829 ca_den_ptr += m; 1830 } 1831 } 1832 1833 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1834 #if defined(PETSC_USE_INFO) 1835 if (matcoloring->brows > 0) { 1836 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1837 } else { 1838 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1839 } 1840 #endif 1841 PetscFunctionReturn(PETSC_SUCCESS); 1842 } 1843 1844 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1845 { 1846 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1847 const PetscInt *is, *ci, *cj, *row_idx; 1848 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1849 IS *isa; 1850 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1851 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1852 PetscInt *colorforcol, *columns, *columns_i, brows; 1853 PetscBool flg; 1854 1855 PetscFunctionBegin; 1856 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1857 1858 /* bs >1 is not being tested yet! */ 1859 Nbs = mat->cmap->N / bs; 1860 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1861 c->N = Nbs; 1862 c->m = c->M; 1863 c->rstart = 0; 1864 c->brows = 100; 1865 1866 c->ncolors = nis; 1867 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1868 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1869 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1870 1871 brows = c->brows; 1872 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1873 if (flg) c->brows = brows; 1874 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1875 1876 colorforrow[0] = 0; 1877 rows_i = rows; 1878 den2sp_i = den2sp; 1879 1880 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1881 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1882 1883 colorforcol[0] = 0; 1884 columns_i = columns; 1885 1886 /* get column-wise storage of mat */ 1887 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1888 1889 cm = c->m; 1890 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1891 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1892 for (i = 0; i < nis; i++) { /* loop over color */ 1893 PetscCall(ISGetLocalSize(isa[i], &n)); 1894 PetscCall(ISGetIndices(isa[i], &is)); 1895 1896 c->ncolumns[i] = n; 1897 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1898 colorforcol[i + 1] = colorforcol[i] + n; 1899 columns_i += n; 1900 1901 /* fast, crude version requires O(N*N) work */ 1902 PetscCall(PetscArrayzero(rowhit, cm)); 1903 1904 for (j = 0; j < n; j++) { /* loop over columns*/ 1905 col = is[j]; 1906 row_idx = cj + ci[col]; 1907 m = ci[col + 1] - ci[col]; 1908 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1909 idxhit[*row_idx] = spidx[ci[col] + k]; 1910 rowhit[*row_idx++] = col + 1; 1911 } 1912 } 1913 /* count the number of hits */ 1914 nrows = 0; 1915 for (j = 0; j < cm; j++) { 1916 if (rowhit[j]) nrows++; 1917 } 1918 c->nrows[i] = nrows; 1919 colorforrow[i + 1] = colorforrow[i] + nrows; 1920 1921 nrows = 0; 1922 for (j = 0; j < cm; j++) { /* loop over rows */ 1923 if (rowhit[j]) { 1924 rows_i[nrows] = j; 1925 den2sp_i[nrows] = idxhit[j]; 1926 nrows++; 1927 } 1928 } 1929 den2sp_i += nrows; 1930 1931 PetscCall(ISRestoreIndices(isa[i], &is)); 1932 rows_i += nrows; 1933 } 1934 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1935 PetscCall(PetscFree(rowhit)); 1936 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1937 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1938 1939 c->colorforrow = colorforrow; 1940 c->rows = rows; 1941 c->den2sp = den2sp; 1942 c->colorforcol = colorforcol; 1943 c->columns = columns; 1944 1945 PetscCall(PetscFree(idxhit)); 1946 PetscFunctionReturn(PETSC_SUCCESS); 1947 } 1948 1949 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1950 { 1951 Mat_Product *product = C->product; 1952 Mat A = product->A, B = product->B; 1953 1954 PetscFunctionBegin; 1955 if (C->ops->mattransposemultnumeric) { 1956 /* Alg: "outerproduct" */ 1957 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1958 } else { 1959 /* Alg: "matmatmult" -- C = At*B */ 1960 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1961 1962 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1963 if (atb->At) { 1964 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1965 user may have called MatProductReplaceMats() to get this A=product->A */ 1966 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1967 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1968 } 1969 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1970 } 1971 PetscFunctionReturn(PETSC_SUCCESS); 1972 } 1973 1974 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1975 { 1976 Mat_Product *product = C->product; 1977 Mat A = product->A, B = product->B; 1978 PetscReal fill = product->fill; 1979 1980 PetscFunctionBegin; 1981 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1982 1983 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1984 PetscFunctionReturn(PETSC_SUCCESS); 1985 } 1986 1987 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1988 { 1989 Mat_Product *product = C->product; 1990 PetscInt alg = 0; /* default algorithm */ 1991 PetscBool flg = PETSC_FALSE; 1992 #if !defined(PETSC_HAVE_HYPRE) 1993 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1994 PetscInt nalg = 7; 1995 #else 1996 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1997 PetscInt nalg = 8; 1998 #endif 1999 2000 PetscFunctionBegin; 2001 /* Set default algorithm */ 2002 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2003 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2004 2005 /* Get runtime option */ 2006 if (product->api_user) { 2007 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2008 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2009 PetscOptionsEnd(); 2010 } else { 2011 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2012 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2013 PetscOptionsEnd(); 2014 } 2015 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2016 2017 C->ops->productsymbolic = MatProductSymbolic_AB; 2018 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2019 PetscFunctionReturn(PETSC_SUCCESS); 2020 } 2021 2022 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2023 { 2024 Mat_Product *product = C->product; 2025 PetscInt alg = 0; /* default algorithm */ 2026 PetscBool flg = PETSC_FALSE; 2027 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2028 PetscInt nalg = 3; 2029 2030 PetscFunctionBegin; 2031 /* Get runtime option */ 2032 if (product->api_user) { 2033 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2034 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2035 PetscOptionsEnd(); 2036 } else { 2037 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2038 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2039 PetscOptionsEnd(); 2040 } 2041 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2042 2043 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2044 PetscFunctionReturn(PETSC_SUCCESS); 2045 } 2046 2047 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2048 { 2049 Mat_Product *product = C->product; 2050 PetscInt alg = 0; /* default algorithm */ 2051 PetscBool flg = PETSC_FALSE; 2052 const char *algTypes[2] = {"default", "color"}; 2053 PetscInt nalg = 2; 2054 2055 PetscFunctionBegin; 2056 /* Set default algorithm */ 2057 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2058 if (!flg) { 2059 alg = 1; 2060 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2061 } 2062 2063 /* Get runtime option */ 2064 if (product->api_user) { 2065 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2066 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2067 PetscOptionsEnd(); 2068 } else { 2069 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2070 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2071 PetscOptionsEnd(); 2072 } 2073 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2074 2075 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2076 C->ops->productsymbolic = MatProductSymbolic_ABt; 2077 PetscFunctionReturn(PETSC_SUCCESS); 2078 } 2079 2080 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2081 { 2082 Mat_Product *product = C->product; 2083 PetscBool flg = PETSC_FALSE; 2084 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2085 #if !defined(PETSC_HAVE_HYPRE) 2086 const char *algTypes[2] = {"scalable", "rap"}; 2087 PetscInt nalg = 2; 2088 #else 2089 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2090 PetscInt nalg = 3; 2091 #endif 2092 2093 PetscFunctionBegin; 2094 /* Set default algorithm */ 2095 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2096 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2097 2098 /* Get runtime option */ 2099 if (product->api_user) { 2100 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2101 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2102 PetscOptionsEnd(); 2103 } else { 2104 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2105 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2106 PetscOptionsEnd(); 2107 } 2108 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2109 2110 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2111 PetscFunctionReturn(PETSC_SUCCESS); 2112 } 2113 2114 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2115 { 2116 Mat_Product *product = C->product; 2117 PetscBool flg = PETSC_FALSE; 2118 PetscInt alg = 0; /* default algorithm */ 2119 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2120 PetscInt nalg = 3; 2121 2122 PetscFunctionBegin; 2123 /* Set default algorithm */ 2124 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2125 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2126 2127 /* Get runtime option */ 2128 if (product->api_user) { 2129 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2130 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2131 PetscOptionsEnd(); 2132 } else { 2133 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2134 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2135 PetscOptionsEnd(); 2136 } 2137 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2138 2139 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2140 PetscFunctionReturn(PETSC_SUCCESS); 2141 } 2142 2143 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2144 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2145 { 2146 Mat_Product *product = C->product; 2147 PetscInt alg = 0; /* default algorithm */ 2148 PetscBool flg = PETSC_FALSE; 2149 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2150 PetscInt nalg = 7; 2151 2152 PetscFunctionBegin; 2153 /* Set default algorithm */ 2154 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2155 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2156 2157 /* Get runtime option */ 2158 if (product->api_user) { 2159 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2160 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2161 PetscOptionsEnd(); 2162 } else { 2163 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2164 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2165 PetscOptionsEnd(); 2166 } 2167 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2168 2169 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2170 C->ops->productsymbolic = MatProductSymbolic_ABC; 2171 PetscFunctionReturn(PETSC_SUCCESS); 2172 } 2173 2174 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2175 { 2176 Mat_Product *product = C->product; 2177 2178 PetscFunctionBegin; 2179 switch (product->type) { 2180 case MATPRODUCT_AB: 2181 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2182 break; 2183 case MATPRODUCT_AtB: 2184 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2185 break; 2186 case MATPRODUCT_ABt: 2187 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2188 break; 2189 case MATPRODUCT_PtAP: 2190 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2191 break; 2192 case MATPRODUCT_RARt: 2193 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2194 break; 2195 case MATPRODUCT_ABC: 2196 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2197 break; 2198 default: 2199 break; 2200 } 2201 PetscFunctionReturn(PETSC_SUCCESS); 2202 } 2203