1 2 /* 3 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 4 C = A * B 5 */ 6 7 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 8 #include <../src/mat/utils/freespace.h> 9 #include <petscbt.h> 10 #include <petsc/private/isimpl.h> 11 #include <../src/mat/impls/dense/seq/dense.h> 12 13 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 14 { 15 PetscFunctionBegin; 16 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 17 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 18 PetscFunctionReturn(PETSC_SUCCESS); 19 } 20 21 /* Modified from MatCreateSeqAIJWithArrays() */ 22 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 23 { 24 PetscInt ii; 25 Mat_SeqAIJ *aij; 26 PetscBool isseqaij, osingle, ofree_a, ofree_ij; 27 28 PetscFunctionBegin; 29 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 30 PetscCall(MatSetSizes(mat, m, n, m, n)); 31 32 if (!mtype) { 33 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 34 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 35 } else { 36 PetscCall(MatSetType(mat, mtype)); 37 } 38 39 aij = (Mat_SeqAIJ *)(mat)->data; 40 osingle = aij->singlemalloc; 41 ofree_a = aij->free_a; 42 ofree_ij = aij->free_ij; 43 /* changes the free flags */ 44 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 45 46 PetscCall(PetscFree(aij->ilen)); 47 PetscCall(PetscFree(aij->imax)); 48 PetscCall(PetscMalloc1(m, &aij->imax)); 49 PetscCall(PetscMalloc1(m, &aij->ilen)); 50 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 51 const PetscInt rnz = i[ii + 1] - i[ii]; 52 aij->nonzerorowcnt += !!rnz; 53 aij->rmax = PetscMax(aij->rmax, rnz); 54 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 55 } 56 aij->maxnz = i[m]; 57 aij->nz = i[m]; 58 59 if (osingle) { 60 PetscCall(PetscFree3(aij->a, aij->j, aij->i)); 61 } else { 62 if (ofree_a) PetscCall(PetscFree(aij->a)); 63 if (ofree_ij) PetscCall(PetscFree(aij->j)); 64 if (ofree_ij) PetscCall(PetscFree(aij->i)); 65 } 66 aij->i = i; 67 aij->j = j; 68 aij->a = a; 69 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 70 /* default to not retain ownership */ 71 aij->singlemalloc = PETSC_FALSE; 72 aij->free_a = PETSC_FALSE; 73 aij->free_ij = PETSC_FALSE; 74 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 75 PetscFunctionReturn(PETSC_SUCCESS); 76 } 77 78 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 79 { 80 Mat_Product *product = C->product; 81 MatProductAlgorithm alg; 82 PetscBool flg; 83 84 PetscFunctionBegin; 85 if (product) { 86 alg = product->alg; 87 } else { 88 alg = "sorted"; 89 } 90 /* sorted */ 91 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 92 if (flg) { 93 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 94 PetscFunctionReturn(PETSC_SUCCESS); 95 } 96 97 /* scalable */ 98 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 99 if (flg) { 100 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 101 PetscFunctionReturn(PETSC_SUCCESS); 102 } 103 104 /* scalable_fast */ 105 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 106 if (flg) { 107 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 108 PetscFunctionReturn(PETSC_SUCCESS); 109 } 110 111 /* heap */ 112 PetscCall(PetscStrcmp(alg, "heap", &flg)); 113 if (flg) { 114 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 115 PetscFunctionReturn(PETSC_SUCCESS); 116 } 117 118 /* btheap */ 119 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 120 if (flg) { 121 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 122 PetscFunctionReturn(PETSC_SUCCESS); 123 } 124 125 /* llcondensed */ 126 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 127 if (flg) { 128 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 129 PetscFunctionReturn(PETSC_SUCCESS); 130 } 131 132 /* rowmerge */ 133 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 134 if (flg) { 135 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 136 PetscFunctionReturn(PETSC_SUCCESS); 137 } 138 139 #if defined(PETSC_HAVE_HYPRE) 140 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 141 if (flg) { 142 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 143 PetscFunctionReturn(PETSC_SUCCESS); 144 } 145 #endif 146 147 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 148 } 149 150 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 151 { 152 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 153 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 154 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 155 PetscReal afill; 156 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 157 PetscHMapI ta; 158 PetscBT lnkbt; 159 PetscFreeSpaceList free_space = NULL, current_space = NULL; 160 161 PetscFunctionBegin; 162 /* Get ci and cj */ 163 /*---------------*/ 164 /* Allocate ci array, arrays for fill computation and */ 165 /* free space for accumulating nonzero column info */ 166 PetscCall(PetscMalloc1(am + 2, &ci)); 167 ci[0] = 0; 168 169 /* create and initialize a linked list */ 170 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 171 MatRowMergeMax_SeqAIJ(b, bm, ta); 172 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 173 PetscCall(PetscHMapIDestroy(&ta)); 174 175 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 176 177 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 178 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 179 180 current_space = free_space; 181 182 /* Determine ci and cj */ 183 for (i = 0; i < am; i++) { 184 anzi = ai[i + 1] - ai[i]; 185 aj = a->j + ai[i]; 186 for (j = 0; j < anzi; j++) { 187 brow = aj[j]; 188 bnzj = bi[brow + 1] - bi[brow]; 189 bj = b->j + bi[brow]; 190 /* add non-zero cols of B into the sorted linked list lnk */ 191 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 192 } 193 /* add possible missing diagonal entry */ 194 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 195 cnzi = lnk[0]; 196 197 /* If free space is not available, make more free space */ 198 /* Double the amount of total space in the list */ 199 if (current_space->local_remaining < cnzi) { 200 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 201 ndouble++; 202 } 203 204 /* Copy data into free space, then initialize lnk */ 205 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 206 207 current_space->array += cnzi; 208 current_space->local_used += cnzi; 209 current_space->local_remaining -= cnzi; 210 211 ci[i + 1] = ci[i] + cnzi; 212 } 213 214 /* Column indices are in the list of free space */ 215 /* Allocate space for cj, initialize cj, and */ 216 /* destroy list of free space and other temporary array(s) */ 217 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 218 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 219 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 220 221 /* put together the new symbolic matrix */ 222 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 223 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 224 225 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 226 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 227 c = (Mat_SeqAIJ *)(C->data); 228 c->free_a = PETSC_FALSE; 229 c->free_ij = PETSC_TRUE; 230 c->nonew = 0; 231 232 /* fast, needs non-scalable O(bn) array 'abdense' */ 233 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 234 235 /* set MatInfo */ 236 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 237 if (afill < 1.0) afill = 1.0; 238 C->info.mallocs = ndouble; 239 C->info.fill_ratio_given = fill; 240 C->info.fill_ratio_needed = afill; 241 242 #if defined(PETSC_USE_INFO) 243 if (ci[am]) { 244 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 245 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 246 } else { 247 PetscCall(PetscInfo(C, "Empty matrix product\n")); 248 } 249 #endif 250 PetscFunctionReturn(PETSC_SUCCESS); 251 } 252 253 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 254 { 255 PetscLogDouble flops = 0.0; 256 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 257 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 258 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 259 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 260 PetscInt am = A->rmap->n, cm = C->rmap->n; 261 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 262 PetscScalar *ca, valtmp; 263 PetscScalar *ab_dense; 264 PetscContainer cab_dense; 265 const PetscScalar *aa, *ba, *baj; 266 267 PetscFunctionBegin; 268 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 269 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 270 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 271 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 272 c->a = ca; 273 c->free_a = PETSC_TRUE; 274 } else ca = c->a; 275 276 /* TODO this should be done in the symbolic phase */ 277 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 278 that is hard to eradicate) */ 279 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 280 if (!cab_dense) { 281 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 282 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 283 PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 284 PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 285 PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 286 PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 287 } 288 PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 289 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 290 291 /* clean old values in C */ 292 PetscCall(PetscArrayzero(ca, ci[cm])); 293 /* Traverse A row-wise. */ 294 /* Build the ith row in C by summing over nonzero columns in A, */ 295 /* the rows of B corresponding to nonzeros of A. */ 296 for (i = 0; i < am; i++) { 297 anzi = ai[i + 1] - ai[i]; 298 for (j = 0; j < anzi; j++) { 299 brow = aj[j]; 300 bnzi = bi[brow + 1] - bi[brow]; 301 bjj = bj + bi[brow]; 302 baj = ba + bi[brow]; 303 /* perform dense axpy */ 304 valtmp = aa[j]; 305 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 306 flops += 2 * bnzi; 307 } 308 aj += anzi; 309 aa += anzi; 310 311 cnzi = ci[i + 1] - ci[i]; 312 for (k = 0; k < cnzi; k++) { 313 ca[k] += ab_dense[cj[k]]; 314 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 315 } 316 flops += cnzi; 317 cj += cnzi; 318 ca += cnzi; 319 } 320 #if defined(PETSC_HAVE_DEVICE) 321 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 322 #endif 323 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 324 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 325 PetscCall(PetscLogFlops(flops)); 326 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 327 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 328 PetscFunctionReturn(PETSC_SUCCESS); 329 } 330 331 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 332 { 333 PetscLogDouble flops = 0.0; 334 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 335 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 336 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 337 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 338 PetscInt am = A->rmap->N, cm = C->rmap->N; 339 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 340 PetscScalar *ca = c->a, valtmp; 341 const PetscScalar *aa, *ba, *baj; 342 PetscInt nextb; 343 344 PetscFunctionBegin; 345 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 346 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 347 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 348 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 349 c->a = ca; 350 c->free_a = PETSC_TRUE; 351 } 352 353 /* clean old values in C */ 354 PetscCall(PetscArrayzero(ca, ci[cm])); 355 /* Traverse A row-wise. */ 356 /* Build the ith row in C by summing over nonzero columns in A, */ 357 /* the rows of B corresponding to nonzeros of A. */ 358 for (i = 0; i < am; i++) { 359 anzi = ai[i + 1] - ai[i]; 360 cnzi = ci[i + 1] - ci[i]; 361 for (j = 0; j < anzi; j++) { 362 brow = aj[j]; 363 bnzi = bi[brow + 1] - bi[brow]; 364 bjj = bj + bi[brow]; 365 baj = ba + bi[brow]; 366 /* perform sparse axpy */ 367 valtmp = aa[j]; 368 nextb = 0; 369 for (k = 0; nextb < bnzi; k++) { 370 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 371 ca[k] += valtmp * baj[nextb++]; 372 } 373 } 374 flops += 2 * bnzi; 375 } 376 aj += anzi; 377 aa += anzi; 378 cj += cnzi; 379 ca += cnzi; 380 } 381 #if defined(PETSC_HAVE_DEVICE) 382 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 383 #endif 384 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 385 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 386 PetscCall(PetscLogFlops(flops)); 387 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 388 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 389 PetscFunctionReturn(PETSC_SUCCESS); 390 } 391 392 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 393 { 394 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 395 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 396 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 397 MatScalar *ca; 398 PetscReal afill; 399 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 400 PetscHMapI ta; 401 PetscFreeSpaceList free_space = NULL, current_space = NULL; 402 403 PetscFunctionBegin; 404 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 405 /*-----------------------------------------------------------------------------------------*/ 406 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 407 PetscCall(PetscMalloc1(am + 2, &ci)); 408 ci[0] = 0; 409 410 /* create and initialize a linked list */ 411 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 412 MatRowMergeMax_SeqAIJ(b, bm, ta); 413 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 414 PetscCall(PetscHMapIDestroy(&ta)); 415 416 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 417 418 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 419 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 420 current_space = free_space; 421 422 /* Determine ci and cj */ 423 for (i = 0; i < am; i++) { 424 anzi = ai[i + 1] - ai[i]; 425 aj = a->j + ai[i]; 426 for (j = 0; j < anzi; j++) { 427 brow = aj[j]; 428 bnzj = bi[brow + 1] - bi[brow]; 429 bj = b->j + bi[brow]; 430 /* add non-zero cols of B into the sorted linked list lnk */ 431 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 432 } 433 /* add possible missing diagonal entry */ 434 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 435 cnzi = lnk[1]; 436 437 /* If free space is not available, make more free space */ 438 /* Double the amount of total space in the list */ 439 if (current_space->local_remaining < cnzi) { 440 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 441 ndouble++; 442 } 443 444 /* Copy data into free space, then initialize lnk */ 445 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 446 447 current_space->array += cnzi; 448 current_space->local_used += cnzi; 449 current_space->local_remaining -= cnzi; 450 451 ci[i + 1] = ci[i] + cnzi; 452 } 453 454 /* Column indices are in the list of free space */ 455 /* Allocate space for cj, initialize cj, and */ 456 /* destroy list of free space and other temporary array(s) */ 457 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 458 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 459 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 460 461 /* Allocate space for ca */ 462 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 463 464 /* put together the new symbolic matrix */ 465 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 466 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 467 468 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 469 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 470 c = (Mat_SeqAIJ *)(C->data); 471 c->free_a = PETSC_TRUE; 472 c->free_ij = PETSC_TRUE; 473 c->nonew = 0; 474 475 /* slower, less memory */ 476 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 477 478 /* set MatInfo */ 479 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 480 if (afill < 1.0) afill = 1.0; 481 C->info.mallocs = ndouble; 482 C->info.fill_ratio_given = fill; 483 C->info.fill_ratio_needed = afill; 484 485 #if defined(PETSC_USE_INFO) 486 if (ci[am]) { 487 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 488 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 489 } else { 490 PetscCall(PetscInfo(C, "Empty matrix product\n")); 491 } 492 #endif 493 PetscFunctionReturn(PETSC_SUCCESS); 494 } 495 496 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 497 { 498 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 499 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 500 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 501 MatScalar *ca; 502 PetscReal afill; 503 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 504 PetscHMapI ta; 505 PetscFreeSpaceList free_space = NULL, current_space = NULL; 506 507 PetscFunctionBegin; 508 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 509 /*---------------------------------------------------------------------------------------------*/ 510 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 511 PetscCall(PetscMalloc1(am + 2, &ci)); 512 ci[0] = 0; 513 514 /* create and initialize a linked list */ 515 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 516 MatRowMergeMax_SeqAIJ(b, bm, ta); 517 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 518 PetscCall(PetscHMapIDestroy(&ta)); 519 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 520 521 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 522 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 523 current_space = free_space; 524 525 /* Determine ci and cj */ 526 for (i = 0; i < am; i++) { 527 anzi = ai[i + 1] - ai[i]; 528 aj = a->j + ai[i]; 529 for (j = 0; j < anzi; j++) { 530 brow = aj[j]; 531 bnzj = bi[brow + 1] - bi[brow]; 532 bj = b->j + bi[brow]; 533 /* add non-zero cols of B into the sorted linked list lnk */ 534 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 535 } 536 /* add possible missing diagonal entry */ 537 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 538 539 cnzi = lnk[0]; 540 541 /* If free space is not available, make more free space */ 542 /* Double the amount of total space in the list */ 543 if (current_space->local_remaining < cnzi) { 544 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 545 ndouble++; 546 } 547 548 /* Copy data into free space, then initialize lnk */ 549 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 550 551 current_space->array += cnzi; 552 current_space->local_used += cnzi; 553 current_space->local_remaining -= cnzi; 554 555 ci[i + 1] = ci[i] + cnzi; 556 } 557 558 /* Column indices are in the list of free space */ 559 /* Allocate space for cj, initialize cj, and */ 560 /* destroy list of free space and other temporary array(s) */ 561 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 562 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 563 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 564 565 /* Allocate space for ca */ 566 /*-----------------------*/ 567 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 568 569 /* put together the new symbolic matrix */ 570 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 571 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 572 573 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 574 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 575 c = (Mat_SeqAIJ *)(C->data); 576 c->free_a = PETSC_TRUE; 577 c->free_ij = PETSC_TRUE; 578 c->nonew = 0; 579 580 /* slower, less memory */ 581 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 582 583 /* set MatInfo */ 584 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 585 if (afill < 1.0) afill = 1.0; 586 C->info.mallocs = ndouble; 587 C->info.fill_ratio_given = fill; 588 C->info.fill_ratio_needed = afill; 589 590 #if defined(PETSC_USE_INFO) 591 if (ci[am]) { 592 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 593 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 594 } else { 595 PetscCall(PetscInfo(C, "Empty matrix product\n")); 596 } 597 #endif 598 PetscFunctionReturn(PETSC_SUCCESS); 599 } 600 601 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 602 { 603 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 604 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 605 PetscInt *ci, *cj, *bb; 606 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 607 PetscReal afill; 608 PetscInt i, j, col, ndouble = 0; 609 PetscFreeSpaceList free_space = NULL, current_space = NULL; 610 PetscHeap h; 611 612 PetscFunctionBegin; 613 /* Get ci and cj - by merging sorted rows using a heap */ 614 /*---------------------------------------------------------------------------------------------*/ 615 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 616 PetscCall(PetscMalloc1(am + 2, &ci)); 617 ci[0] = 0; 618 619 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 620 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 621 current_space = free_space; 622 623 PetscCall(PetscHeapCreate(a->rmax, &h)); 624 PetscCall(PetscMalloc1(a->rmax, &bb)); 625 626 /* Determine ci and cj */ 627 for (i = 0; i < am; i++) { 628 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 629 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 630 ci[i + 1] = ci[i]; 631 /* Populate the min heap */ 632 for (j = 0; j < anzi; j++) { 633 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 634 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 635 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 636 } 637 } 638 /* Pick off the min element, adding it to free space */ 639 PetscCall(PetscHeapPop(h, &j, &col)); 640 while (j >= 0) { 641 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 642 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 643 ndouble++; 644 } 645 *(current_space->array++) = col; 646 current_space->local_used++; 647 current_space->local_remaining--; 648 ci[i + 1]++; 649 650 /* stash if anything else remains in this row of B */ 651 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 652 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 653 PetscInt j2, col2; 654 PetscCall(PetscHeapPeek(h, &j2, &col2)); 655 if (col2 != col) break; 656 PetscCall(PetscHeapPop(h, &j2, &col2)); 657 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 658 } 659 /* Put any stashed elements back into the min heap */ 660 PetscCall(PetscHeapUnstash(h)); 661 PetscCall(PetscHeapPop(h, &j, &col)); 662 } 663 } 664 PetscCall(PetscFree(bb)); 665 PetscCall(PetscHeapDestroy(&h)); 666 667 /* Column indices are in the list of free space */ 668 /* Allocate space for cj, initialize cj, and */ 669 /* destroy list of free space and other temporary array(s) */ 670 PetscCall(PetscMalloc1(ci[am], &cj)); 671 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 672 673 /* put together the new symbolic matrix */ 674 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 675 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 676 677 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 678 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 679 c = (Mat_SeqAIJ *)(C->data); 680 c->free_a = PETSC_TRUE; 681 c->free_ij = PETSC_TRUE; 682 c->nonew = 0; 683 684 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 685 686 /* set MatInfo */ 687 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 688 if (afill < 1.0) afill = 1.0; 689 C->info.mallocs = ndouble; 690 C->info.fill_ratio_given = fill; 691 C->info.fill_ratio_needed = afill; 692 693 #if defined(PETSC_USE_INFO) 694 if (ci[am]) { 695 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 696 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 697 } else { 698 PetscCall(PetscInfo(C, "Empty matrix product\n")); 699 } 700 #endif 701 PetscFunctionReturn(PETSC_SUCCESS); 702 } 703 704 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 705 { 706 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 707 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 708 PetscInt *ci, *cj, *bb; 709 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 710 PetscReal afill; 711 PetscInt i, j, col, ndouble = 0; 712 PetscFreeSpaceList free_space = NULL, current_space = NULL; 713 PetscHeap h; 714 PetscBT bt; 715 716 PetscFunctionBegin; 717 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 718 /*---------------------------------------------------------------------------------------------*/ 719 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 720 PetscCall(PetscMalloc1(am + 2, &ci)); 721 ci[0] = 0; 722 723 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 724 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 725 726 current_space = free_space; 727 728 PetscCall(PetscHeapCreate(a->rmax, &h)); 729 PetscCall(PetscMalloc1(a->rmax, &bb)); 730 PetscCall(PetscBTCreate(bn, &bt)); 731 732 /* Determine ci and cj */ 733 for (i = 0; i < am; i++) { 734 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 735 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 736 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 737 ci[i + 1] = ci[i]; 738 /* Populate the min heap */ 739 for (j = 0; j < anzi; j++) { 740 PetscInt brow = acol[j]; 741 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 742 PetscInt bcol = bj[bb[j]]; 743 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 744 PetscCall(PetscHeapAdd(h, j, bcol)); 745 bb[j]++; 746 break; 747 } 748 } 749 } 750 /* Pick off the min element, adding it to free space */ 751 PetscCall(PetscHeapPop(h, &j, &col)); 752 while (j >= 0) { 753 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 754 fptr = NULL; /* need PetscBTMemzero */ 755 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 756 ndouble++; 757 } 758 *(current_space->array++) = col; 759 current_space->local_used++; 760 current_space->local_remaining--; 761 ci[i + 1]++; 762 763 /* stash if anything else remains in this row of B */ 764 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 765 PetscInt bcol = bj[bb[j]]; 766 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 767 PetscCall(PetscHeapAdd(h, j, bcol)); 768 bb[j]++; 769 break; 770 } 771 } 772 PetscCall(PetscHeapPop(h, &j, &col)); 773 } 774 if (fptr) { /* Clear the bits for this row */ 775 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 776 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 777 PetscCall(PetscBTMemzero(bn, bt)); 778 } 779 } 780 PetscCall(PetscFree(bb)); 781 PetscCall(PetscHeapDestroy(&h)); 782 PetscCall(PetscBTDestroy(&bt)); 783 784 /* Column indices are in the list of free space */ 785 /* Allocate space for cj, initialize cj, and */ 786 /* destroy list of free space and other temporary array(s) */ 787 PetscCall(PetscMalloc1(ci[am], &cj)); 788 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 789 790 /* put together the new symbolic matrix */ 791 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 792 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 793 794 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 795 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 796 c = (Mat_SeqAIJ *)(C->data); 797 c->free_a = PETSC_TRUE; 798 c->free_ij = PETSC_TRUE; 799 c->nonew = 0; 800 801 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 802 803 /* set MatInfo */ 804 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 805 if (afill < 1.0) afill = 1.0; 806 C->info.mallocs = ndouble; 807 C->info.fill_ratio_given = fill; 808 C->info.fill_ratio_needed = afill; 809 810 #if defined(PETSC_USE_INFO) 811 if (ci[am]) { 812 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 813 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 814 } else { 815 PetscCall(PetscInfo(C, "Empty matrix product\n")); 816 } 817 #endif 818 PetscFunctionReturn(PETSC_SUCCESS); 819 } 820 821 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 822 { 823 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 824 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 825 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 826 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 827 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 828 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 829 const PetscInt *brow_ptr[8], *brow_end[8]; 830 PetscInt window[8]; 831 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 832 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 833 PetscReal afill; 834 PetscInt *workj_L1, *workj_L2, *workj_L3; 835 PetscInt L1_nnz, L2_nnz; 836 837 /* Step 1: Get upper bound on memory required for allocation. 838 Because of the way virtual memory works, 839 only the memory pages that are actually needed will be physically allocated. */ 840 PetscFunctionBegin; 841 PetscCall(PetscMalloc1(am + 1, &ci)); 842 for (i = 0; i < am; i++) { 843 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 844 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 845 a_rownnz = 0; 846 for (k = 0; k < anzi; ++k) { 847 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 848 if (a_rownnz > bn) { 849 a_rownnz = bn; 850 break; 851 } 852 } 853 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 854 } 855 /* temporary work areas for merging rows */ 856 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 857 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 858 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 859 860 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 861 c_maxmem = 8 * (ai[am] + bi[bm]); 862 /* Step 2: Populate pattern for C */ 863 PetscCall(PetscMalloc1(c_maxmem, &cj)); 864 865 ci_nnz = 0; 866 ci[0] = 0; 867 worki_L1[0] = 0; 868 worki_L2[0] = 0; 869 for (i = 0; i < am; i++) { 870 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 871 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 872 rowsleft = anzi; 873 inputcol_L1 = acol; 874 L2_nnz = 0; 875 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 876 worki_L2[1] = 0; 877 outputi_nnz = 0; 878 879 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 880 while (ci_nnz + a_maxrownnz > c_maxmem) { 881 c_maxmem *= 2; 882 ndouble++; 883 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 884 } 885 886 while (rowsleft) { 887 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 888 L1_nrows = 0; 889 L1_nnz = 0; 890 inputcol = inputcol_L1; 891 inputi = bi; 892 inputj = bj; 893 894 /* The following macro is used to specialize for small rows in A. 895 This helps with compiler unrolling, improving performance substantially. 896 Input: inputj inputi inputcol bn 897 Output: outputj outputi_nnz */ 898 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 899 window_min = bn; \ 900 outputi_nnz = 0; \ 901 for (k = 0; k < ANNZ; ++k) { \ 902 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 903 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 904 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 905 window_min = PetscMin(window[k], window_min); \ 906 } \ 907 while (window_min < bn) { \ 908 outputj[outputi_nnz++] = window_min; \ 909 /* advance front and compute new minimum */ \ 910 old_window_min = window_min; \ 911 window_min = bn; \ 912 for (k = 0; k < ANNZ; ++k) { \ 913 if (window[k] == old_window_min) { \ 914 brow_ptr[k]++; \ 915 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 916 } \ 917 window_min = PetscMin(window[k], window_min); \ 918 } \ 919 } 920 921 /************** L E V E L 1 ***************/ 922 /* Merge up to 8 rows of B to L1 work array*/ 923 while (L1_rowsleft) { 924 outputi_nnz = 0; 925 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 926 else outputj = cj + ci_nnz; /* Merge directly to C */ 927 928 switch (L1_rowsleft) { 929 case 1: 930 brow_ptr[0] = inputj + inputi[inputcol[0]]; 931 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 932 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 933 inputcol += L1_rowsleft; 934 rowsleft -= L1_rowsleft; 935 L1_rowsleft = 0; 936 break; 937 case 2: 938 MatMatMultSymbolic_RowMergeMacro(2); 939 inputcol += L1_rowsleft; 940 rowsleft -= L1_rowsleft; 941 L1_rowsleft = 0; 942 break; 943 case 3: 944 MatMatMultSymbolic_RowMergeMacro(3); 945 inputcol += L1_rowsleft; 946 rowsleft -= L1_rowsleft; 947 L1_rowsleft = 0; 948 break; 949 case 4: 950 MatMatMultSymbolic_RowMergeMacro(4); 951 inputcol += L1_rowsleft; 952 rowsleft -= L1_rowsleft; 953 L1_rowsleft = 0; 954 break; 955 case 5: 956 MatMatMultSymbolic_RowMergeMacro(5); 957 inputcol += L1_rowsleft; 958 rowsleft -= L1_rowsleft; 959 L1_rowsleft = 0; 960 break; 961 case 6: 962 MatMatMultSymbolic_RowMergeMacro(6); 963 inputcol += L1_rowsleft; 964 rowsleft -= L1_rowsleft; 965 L1_rowsleft = 0; 966 break; 967 case 7: 968 MatMatMultSymbolic_RowMergeMacro(7); 969 inputcol += L1_rowsleft; 970 rowsleft -= L1_rowsleft; 971 L1_rowsleft = 0; 972 break; 973 default: 974 MatMatMultSymbolic_RowMergeMacro(8); 975 inputcol += 8; 976 rowsleft -= 8; 977 L1_rowsleft -= 8; 978 break; 979 } 980 inputcol_L1 = inputcol; 981 L1_nnz += outputi_nnz; 982 worki_L1[++L1_nrows] = L1_nnz; 983 } 984 985 /********************** L E V E L 2 ************************/ 986 /* Merge from L1 work array to either C or to L2 work array */ 987 if (anzi > 8) { 988 inputi = worki_L1; 989 inputj = workj_L1; 990 inputcol = workcol; 991 outputi_nnz = 0; 992 993 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 994 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 995 996 switch (L1_nrows) { 997 case 1: 998 brow_ptr[0] = inputj + inputi[inputcol[0]]; 999 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1000 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1001 break; 1002 case 2: 1003 MatMatMultSymbolic_RowMergeMacro(2); 1004 break; 1005 case 3: 1006 MatMatMultSymbolic_RowMergeMacro(3); 1007 break; 1008 case 4: 1009 MatMatMultSymbolic_RowMergeMacro(4); 1010 break; 1011 case 5: 1012 MatMatMultSymbolic_RowMergeMacro(5); 1013 break; 1014 case 6: 1015 MatMatMultSymbolic_RowMergeMacro(6); 1016 break; 1017 case 7: 1018 MatMatMultSymbolic_RowMergeMacro(7); 1019 break; 1020 case 8: 1021 MatMatMultSymbolic_RowMergeMacro(8); 1022 break; 1023 default: 1024 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1025 } 1026 L2_nnz += outputi_nnz; 1027 worki_L2[++L2_nrows] = L2_nnz; 1028 1029 /************************ L E V E L 3 **********************/ 1030 /* Merge from L2 work array to either C or to L2 work array */ 1031 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1032 inputi = worki_L2; 1033 inputj = workj_L2; 1034 inputcol = workcol; 1035 outputi_nnz = 0; 1036 if (rowsleft) outputj = workj_L3; 1037 else outputj = cj + ci_nnz; 1038 switch (L2_nrows) { 1039 case 1: 1040 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1041 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1042 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1043 break; 1044 case 2: 1045 MatMatMultSymbolic_RowMergeMacro(2); 1046 break; 1047 case 3: 1048 MatMatMultSymbolic_RowMergeMacro(3); 1049 break; 1050 case 4: 1051 MatMatMultSymbolic_RowMergeMacro(4); 1052 break; 1053 case 5: 1054 MatMatMultSymbolic_RowMergeMacro(5); 1055 break; 1056 case 6: 1057 MatMatMultSymbolic_RowMergeMacro(6); 1058 break; 1059 case 7: 1060 MatMatMultSymbolic_RowMergeMacro(7); 1061 break; 1062 case 8: 1063 MatMatMultSymbolic_RowMergeMacro(8); 1064 break; 1065 default: 1066 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1067 } 1068 L2_nrows = 1; 1069 L2_nnz = outputi_nnz; 1070 worki_L2[1] = outputi_nnz; 1071 /* Copy to workj_L2 */ 1072 if (rowsleft) { 1073 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1074 } 1075 } 1076 } 1077 } /* while (rowsleft) */ 1078 #undef MatMatMultSymbolic_RowMergeMacro 1079 1080 /* terminate current row */ 1081 ci_nnz += outputi_nnz; 1082 ci[i + 1] = ci_nnz; 1083 } 1084 1085 /* Step 3: Create the new symbolic matrix */ 1086 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1087 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1088 1089 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1090 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1091 c = (Mat_SeqAIJ *)(C->data); 1092 c->free_a = PETSC_TRUE; 1093 c->free_ij = PETSC_TRUE; 1094 c->nonew = 0; 1095 1096 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1097 1098 /* set MatInfo */ 1099 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1100 if (afill < 1.0) afill = 1.0; 1101 C->info.mallocs = ndouble; 1102 C->info.fill_ratio_given = fill; 1103 C->info.fill_ratio_needed = afill; 1104 1105 #if defined(PETSC_USE_INFO) 1106 if (ci[am]) { 1107 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1108 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1109 } else { 1110 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1111 } 1112 #endif 1113 1114 /* Step 4: Free temporary work areas */ 1115 PetscCall(PetscFree(workj_L1)); 1116 PetscCall(PetscFree(workj_L2)); 1117 PetscCall(PetscFree(workj_L3)); 1118 PetscFunctionReturn(PETSC_SUCCESS); 1119 } 1120 1121 /* concatenate unique entries and then sort */ 1122 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1123 { 1124 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1125 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1126 PetscInt *ci, *cj, bcol; 1127 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1128 PetscReal afill; 1129 PetscInt i, j, ndouble = 0; 1130 PetscSegBuffer seg, segrow; 1131 char *seen; 1132 1133 PetscFunctionBegin; 1134 PetscCall(PetscMalloc1(am + 1, &ci)); 1135 ci[0] = 0; 1136 1137 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1138 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1139 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1140 PetscCall(PetscCalloc1(bn, &seen)); 1141 1142 /* Determine ci and cj */ 1143 for (i = 0; i < am; i++) { 1144 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1145 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1146 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1147 1148 /* Pack segrow */ 1149 for (j = 0; j < anzi; j++) { 1150 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1151 for (k = bjstart; k < bjend; k++) { 1152 bcol = bj[k]; 1153 if (!seen[bcol]) { /* new entry */ 1154 PetscInt *PETSC_RESTRICT slot; 1155 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1156 *slot = bcol; 1157 seen[bcol] = 1; 1158 packlen++; 1159 } 1160 } 1161 } 1162 1163 /* Check i-th diagonal entry */ 1164 if (C->force_diagonals && !seen[i]) { 1165 PetscInt *PETSC_RESTRICT slot; 1166 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1167 *slot = i; 1168 seen[i] = 1; 1169 packlen++; 1170 } 1171 1172 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1173 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1174 PetscCall(PetscSortInt(packlen, crow)); 1175 ci[i + 1] = ci[i] + packlen; 1176 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1177 } 1178 PetscCall(PetscSegBufferDestroy(&segrow)); 1179 PetscCall(PetscFree(seen)); 1180 1181 /* Column indices are in the segmented buffer */ 1182 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1183 PetscCall(PetscSegBufferDestroy(&seg)); 1184 1185 /* put together the new symbolic matrix */ 1186 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1187 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1188 1189 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1190 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1191 c = (Mat_SeqAIJ *)(C->data); 1192 c->free_a = PETSC_TRUE; 1193 c->free_ij = PETSC_TRUE; 1194 c->nonew = 0; 1195 1196 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1197 1198 /* set MatInfo */ 1199 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1200 if (afill < 1.0) afill = 1.0; 1201 C->info.mallocs = ndouble; 1202 C->info.fill_ratio_given = fill; 1203 C->info.fill_ratio_needed = afill; 1204 1205 #if defined(PETSC_USE_INFO) 1206 if (ci[am]) { 1207 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1208 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1209 } else { 1210 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1211 } 1212 #endif 1213 PetscFunctionReturn(PETSC_SUCCESS); 1214 } 1215 1216 PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1217 { 1218 Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 1219 1220 PetscFunctionBegin; 1221 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1222 PetscCall(MatDestroy(&abt->Bt_den)); 1223 PetscCall(MatDestroy(&abt->ABt_den)); 1224 PetscCall(PetscFree(abt)); 1225 PetscFunctionReturn(PETSC_SUCCESS); 1226 } 1227 1228 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1229 { 1230 Mat Bt; 1231 Mat_MatMatTransMult *abt; 1232 Mat_Product *product = C->product; 1233 char *alg; 1234 1235 PetscFunctionBegin; 1236 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1237 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1238 1239 /* create symbolic Bt */ 1240 PetscCall(MatTransposeSymbolic(B, &Bt)); 1241 PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1242 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1243 1244 /* get symbolic C=A*Bt */ 1245 PetscCall(PetscStrallocpy(product->alg, &alg)); 1246 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1247 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1248 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1249 PetscCall(PetscFree(alg)); 1250 1251 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1252 PetscCall(PetscNew(&abt)); 1253 1254 product->data = abt; 1255 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1256 1257 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1258 1259 abt->usecoloring = PETSC_FALSE; 1260 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1261 if (abt->usecoloring) { 1262 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1263 MatTransposeColoring matcoloring; 1264 MatColoring coloring; 1265 ISColoring iscoloring; 1266 Mat Bt_dense, C_dense; 1267 1268 /* inode causes memory problem */ 1269 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1270 1271 PetscCall(MatColoringCreate(C, &coloring)); 1272 PetscCall(MatColoringSetDistance(coloring, 2)); 1273 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1274 PetscCall(MatColoringSetFromOptions(coloring)); 1275 PetscCall(MatColoringApply(coloring, &iscoloring)); 1276 PetscCall(MatColoringDestroy(&coloring)); 1277 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1278 1279 abt->matcoloring = matcoloring; 1280 1281 PetscCall(ISColoringDestroy(&iscoloring)); 1282 1283 /* Create Bt_dense and C_dense = A*Bt_dense */ 1284 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1285 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1286 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1287 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1288 1289 Bt_dense->assembled = PETSC_TRUE; 1290 abt->Bt_den = Bt_dense; 1291 1292 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1293 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1294 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1295 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1296 1297 Bt_dense->assembled = PETSC_TRUE; 1298 abt->ABt_den = C_dense; 1299 1300 #if defined(PETSC_USE_INFO) 1301 { 1302 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1303 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1304 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1305 } 1306 #endif 1307 } 1308 /* clean up */ 1309 PetscCall(MatDestroy(&Bt)); 1310 PetscFunctionReturn(PETSC_SUCCESS); 1311 } 1312 1313 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1314 { 1315 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1316 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1317 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1318 PetscLogDouble flops = 0.0; 1319 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1320 Mat_MatMatTransMult *abt; 1321 Mat_Product *product = C->product; 1322 1323 PetscFunctionBegin; 1324 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1325 abt = (Mat_MatMatTransMult *)product->data; 1326 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1327 /* clear old values in C */ 1328 if (!c->a) { 1329 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1330 c->a = ca; 1331 c->free_a = PETSC_TRUE; 1332 } else { 1333 ca = c->a; 1334 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1335 } 1336 1337 if (abt->usecoloring) { 1338 MatTransposeColoring matcoloring = abt->matcoloring; 1339 Mat Bt_dense, C_dense = abt->ABt_den; 1340 1341 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1342 Bt_dense = abt->Bt_den; 1343 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1344 1345 /* C_dense = A*Bt_dense */ 1346 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1347 1348 /* Recover C from C_dense */ 1349 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1350 PetscFunctionReturn(PETSC_SUCCESS); 1351 } 1352 1353 for (i = 0; i < cm; i++) { 1354 anzi = ai[i + 1] - ai[i]; 1355 acol = aj + ai[i]; 1356 aval = aa + ai[i]; 1357 cnzi = ci[i + 1] - ci[i]; 1358 ccol = cj + ci[i]; 1359 cval = ca + ci[i]; 1360 for (j = 0; j < cnzi; j++) { 1361 brow = ccol[j]; 1362 bnzj = bi[brow + 1] - bi[brow]; 1363 bcol = bj + bi[brow]; 1364 bval = ba + bi[brow]; 1365 1366 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1367 nexta = 0; 1368 nextb = 0; 1369 while (nexta < anzi && nextb < bnzj) { 1370 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1371 if (nexta == anzi) break; 1372 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1373 if (nextb == bnzj) break; 1374 if (acol[nexta] == bcol[nextb]) { 1375 cval[j] += aval[nexta] * bval[nextb]; 1376 nexta++; 1377 nextb++; 1378 flops += 2; 1379 } 1380 } 1381 } 1382 } 1383 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1384 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1385 PetscCall(PetscLogFlops(flops)); 1386 PetscFunctionReturn(PETSC_SUCCESS); 1387 } 1388 1389 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1390 { 1391 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 1392 1393 PetscFunctionBegin; 1394 PetscCall(MatDestroy(&atb->At)); 1395 if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 1396 PetscCall(PetscFree(atb)); 1397 PetscFunctionReturn(PETSC_SUCCESS); 1398 } 1399 1400 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1401 { 1402 Mat At = NULL; 1403 Mat_Product *product = C->product; 1404 PetscBool flg, def, square; 1405 1406 PetscFunctionBegin; 1407 MatCheckProduct(C, 4); 1408 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1409 /* outerproduct */ 1410 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1411 if (flg) { 1412 /* create symbolic At */ 1413 if (!square) { 1414 PetscCall(MatTransposeSymbolic(A, &At)); 1415 PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1416 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1417 } 1418 /* get symbolic C=At*B */ 1419 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1420 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1421 1422 /* clean up */ 1423 if (!square) PetscCall(MatDestroy(&At)); 1424 1425 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1426 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1427 PetscFunctionReturn(PETSC_SUCCESS); 1428 } 1429 1430 /* matmatmult */ 1431 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1432 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1433 if (flg || def) { 1434 Mat_MatTransMatMult *atb; 1435 1436 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1437 PetscCall(PetscNew(&atb)); 1438 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1439 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1440 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1441 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1442 product->data = atb; 1443 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1444 atb->At = At; 1445 1446 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1447 PetscFunctionReturn(PETSC_SUCCESS); 1448 } 1449 1450 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1451 } 1452 1453 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1454 { 1455 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1456 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1457 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1458 PetscLogDouble flops = 0.0; 1459 MatScalar *aa = a->a, *ba, *ca, *caj; 1460 1461 PetscFunctionBegin; 1462 if (!c->a) { 1463 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1464 1465 c->a = ca; 1466 c->free_a = PETSC_TRUE; 1467 } else { 1468 ca = c->a; 1469 PetscCall(PetscArrayzero(ca, ci[cm])); 1470 } 1471 1472 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1473 for (i = 0; i < am; i++) { 1474 bj = b->j + bi[i]; 1475 ba = b->a + bi[i]; 1476 bnzi = bi[i + 1] - bi[i]; 1477 anzi = ai[i + 1] - ai[i]; 1478 for (j = 0; j < anzi; j++) { 1479 nextb = 0; 1480 crow = *aj++; 1481 cjj = cj + ci[crow]; 1482 caj = ca + ci[crow]; 1483 /* perform sparse axpy operation. Note cjj includes bj. */ 1484 for (k = 0; nextb < bnzi; k++) { 1485 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1486 caj[k] += (*aa) * (*(ba + nextb)); 1487 nextb++; 1488 } 1489 } 1490 flops += 2 * bnzi; 1491 aa++; 1492 } 1493 } 1494 1495 /* Assemble the final matrix and clean up */ 1496 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1497 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1498 PetscCall(PetscLogFlops(flops)); 1499 PetscFunctionReturn(PETSC_SUCCESS); 1500 } 1501 1502 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1503 { 1504 PetscFunctionBegin; 1505 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1506 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1507 PetscFunctionReturn(PETSC_SUCCESS); 1508 } 1509 1510 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1511 { 1512 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1513 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1514 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1515 const PetscInt *aj; 1516 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1517 PetscInt clda; 1518 PetscInt am4, bm4, col, i, j, n; 1519 1520 PetscFunctionBegin; 1521 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1522 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1523 if (add) { 1524 PetscCall(MatDenseGetArray(C, &c)); 1525 } else { 1526 PetscCall(MatDenseGetArrayWrite(C, &c)); 1527 } 1528 PetscCall(MatDenseGetArrayRead(B, &b)); 1529 PetscCall(MatDenseGetLDA(B, &bm)); 1530 PetscCall(MatDenseGetLDA(C, &clda)); 1531 am4 = 4 * clda; 1532 bm4 = 4 * bm; 1533 b1 = b; 1534 b2 = b1 + bm; 1535 b3 = b2 + bm; 1536 b4 = b3 + bm; 1537 c1 = c; 1538 c2 = c1 + clda; 1539 c3 = c2 + clda; 1540 c4 = c3 + clda; 1541 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1542 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1543 r1 = r2 = r3 = r4 = 0.0; 1544 n = a->i[i + 1] - a->i[i]; 1545 aj = a->j + a->i[i]; 1546 aa = av + a->i[i]; 1547 for (j = 0; j < n; j++) { 1548 const PetscScalar aatmp = aa[j]; 1549 const PetscInt ajtmp = aj[j]; 1550 r1 += aatmp * b1[ajtmp]; 1551 r2 += aatmp * b2[ajtmp]; 1552 r3 += aatmp * b3[ajtmp]; 1553 r4 += aatmp * b4[ajtmp]; 1554 } 1555 if (add) { 1556 c1[i] += r1; 1557 c2[i] += r2; 1558 c3[i] += r3; 1559 c4[i] += r4; 1560 } else { 1561 c1[i] = r1; 1562 c2[i] = r2; 1563 c3[i] = r3; 1564 c4[i] = r4; 1565 } 1566 } 1567 b1 += bm4; 1568 b2 += bm4; 1569 b3 += bm4; 1570 b4 += bm4; 1571 c1 += am4; 1572 c2 += am4; 1573 c3 += am4; 1574 c4 += am4; 1575 } 1576 /* process remaining columns */ 1577 if (col != cn) { 1578 PetscInt rc = cn - col; 1579 1580 if (rc == 1) { 1581 for (i = 0; i < am; i++) { 1582 r1 = 0.0; 1583 n = a->i[i + 1] - a->i[i]; 1584 aj = a->j + a->i[i]; 1585 aa = av + a->i[i]; 1586 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1587 if (add) c1[i] += r1; 1588 else c1[i] = r1; 1589 } 1590 } else if (rc == 2) { 1591 for (i = 0; i < am; i++) { 1592 r1 = r2 = 0.0; 1593 n = a->i[i + 1] - a->i[i]; 1594 aj = a->j + a->i[i]; 1595 aa = av + a->i[i]; 1596 for (j = 0; j < n; j++) { 1597 const PetscScalar aatmp = aa[j]; 1598 const PetscInt ajtmp = aj[j]; 1599 r1 += aatmp * b1[ajtmp]; 1600 r2 += aatmp * b2[ajtmp]; 1601 } 1602 if (add) { 1603 c1[i] += r1; 1604 c2[i] += r2; 1605 } else { 1606 c1[i] = r1; 1607 c2[i] = r2; 1608 } 1609 } 1610 } else { 1611 for (i = 0; i < am; i++) { 1612 r1 = r2 = r3 = 0.0; 1613 n = a->i[i + 1] - a->i[i]; 1614 aj = a->j + a->i[i]; 1615 aa = av + a->i[i]; 1616 for (j = 0; j < n; j++) { 1617 const PetscScalar aatmp = aa[j]; 1618 const PetscInt ajtmp = aj[j]; 1619 r1 += aatmp * b1[ajtmp]; 1620 r2 += aatmp * b2[ajtmp]; 1621 r3 += aatmp * b3[ajtmp]; 1622 } 1623 if (add) { 1624 c1[i] += r1; 1625 c2[i] += r2; 1626 c3[i] += r3; 1627 } else { 1628 c1[i] = r1; 1629 c2[i] = r2; 1630 c3[i] = r3; 1631 } 1632 } 1633 } 1634 } 1635 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1636 if (add) { 1637 PetscCall(MatDenseRestoreArray(C, &c)); 1638 } else { 1639 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1640 } 1641 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1642 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1643 PetscFunctionReturn(PETSC_SUCCESS); 1644 } 1645 1646 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1647 { 1648 PetscFunctionBegin; 1649 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1650 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1651 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1652 1653 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1654 PetscFunctionReturn(PETSC_SUCCESS); 1655 } 1656 1657 /* ------------------------------------------------------- */ 1658 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1659 { 1660 PetscFunctionBegin; 1661 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1662 C->ops->productsymbolic = MatProductSymbolic_AB; 1663 PetscFunctionReturn(PETSC_SUCCESS); 1664 } 1665 1666 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1667 1668 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1669 { 1670 PetscFunctionBegin; 1671 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1672 C->ops->productsymbolic = MatProductSymbolic_AtB; 1673 PetscFunctionReturn(PETSC_SUCCESS); 1674 } 1675 1676 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1677 { 1678 PetscFunctionBegin; 1679 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1680 C->ops->productsymbolic = MatProductSymbolic_ABt; 1681 PetscFunctionReturn(PETSC_SUCCESS); 1682 } 1683 1684 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1685 { 1686 Mat_Product *product = C->product; 1687 1688 PetscFunctionBegin; 1689 switch (product->type) { 1690 case MATPRODUCT_AB: 1691 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1692 break; 1693 case MATPRODUCT_AtB: 1694 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1695 break; 1696 case MATPRODUCT_ABt: 1697 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1698 break; 1699 default: 1700 break; 1701 } 1702 PetscFunctionReturn(PETSC_SUCCESS); 1703 } 1704 /* ------------------------------------------------------- */ 1705 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1706 { 1707 Mat_Product *product = C->product; 1708 Mat A = product->A; 1709 PetscBool baij; 1710 1711 PetscFunctionBegin; 1712 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1713 if (!baij) { /* A is seqsbaij */ 1714 PetscBool sbaij; 1715 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1716 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1717 1718 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1719 } else { /* A is seqbaij */ 1720 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1721 } 1722 1723 C->ops->productsymbolic = MatProductSymbolic_AB; 1724 PetscFunctionReturn(PETSC_SUCCESS); 1725 } 1726 1727 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1728 { 1729 Mat_Product *product = C->product; 1730 1731 PetscFunctionBegin; 1732 MatCheckProduct(C, 1); 1733 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1734 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1735 PetscFunctionReturn(PETSC_SUCCESS); 1736 } 1737 1738 /* ------------------------------------------------------- */ 1739 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1740 { 1741 PetscFunctionBegin; 1742 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1743 C->ops->productsymbolic = MatProductSymbolic_AB; 1744 PetscFunctionReturn(PETSC_SUCCESS); 1745 } 1746 1747 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1748 { 1749 Mat_Product *product = C->product; 1750 1751 PetscFunctionBegin; 1752 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1753 PetscFunctionReturn(PETSC_SUCCESS); 1754 } 1755 /* ------------------------------------------------------- */ 1756 1757 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1758 { 1759 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1760 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1761 PetscInt *bi = b->i, *bj = b->j; 1762 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1763 MatScalar *btval, *btval_den, *ba = b->a; 1764 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1765 1766 PetscFunctionBegin; 1767 btval_den = btdense->v; 1768 PetscCall(PetscArrayzero(btval_den, m * n)); 1769 for (k = 0; k < ncolors; k++) { 1770 ncolumns = coloring->ncolumns[k]; 1771 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1772 col = *(columns + colorforcol[k] + l); 1773 btcol = bj + bi[col]; 1774 btval = ba + bi[col]; 1775 anz = bi[col + 1] - bi[col]; 1776 for (j = 0; j < anz; j++) { 1777 brow = btcol[j]; 1778 btval_den[brow] = btval[j]; 1779 } 1780 } 1781 btval_den += m; 1782 } 1783 PetscFunctionReturn(PETSC_SUCCESS); 1784 } 1785 1786 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1787 { 1788 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1789 const PetscScalar *ca_den, *ca_den_ptr; 1790 PetscScalar *ca = csp->a; 1791 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1792 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1793 PetscInt nrows, *row, *idx; 1794 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1795 1796 PetscFunctionBegin; 1797 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1798 1799 if (brows > 0) { 1800 PetscInt *lstart, row_end, row_start; 1801 lstart = matcoloring->lstart; 1802 PetscCall(PetscArrayzero(lstart, ncolors)); 1803 1804 row_end = brows; 1805 if (row_end > m) row_end = m; 1806 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1807 ca_den_ptr = ca_den; 1808 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1809 nrows = matcoloring->nrows[k]; 1810 row = rows + colorforrow[k]; 1811 idx = den2sp + colorforrow[k]; 1812 for (l = lstart[k]; l < nrows; l++) { 1813 if (row[l] >= row_end) { 1814 lstart[k] = l; 1815 break; 1816 } else { 1817 ca[idx[l]] = ca_den_ptr[row[l]]; 1818 } 1819 } 1820 ca_den_ptr += m; 1821 } 1822 row_end += brows; 1823 if (row_end > m) row_end = m; 1824 } 1825 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1826 ca_den_ptr = ca_den; 1827 for (k = 0; k < ncolors; k++) { 1828 nrows = matcoloring->nrows[k]; 1829 row = rows + colorforrow[k]; 1830 idx = den2sp + colorforrow[k]; 1831 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1832 ca_den_ptr += m; 1833 } 1834 } 1835 1836 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1837 #if defined(PETSC_USE_INFO) 1838 if (matcoloring->brows > 0) { 1839 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1840 } else { 1841 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1842 } 1843 #endif 1844 PetscFunctionReturn(PETSC_SUCCESS); 1845 } 1846 1847 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1848 { 1849 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1850 const PetscInt *is, *ci, *cj, *row_idx; 1851 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1852 IS *isa; 1853 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1854 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1855 PetscInt *colorforcol, *columns, *columns_i, brows; 1856 PetscBool flg; 1857 1858 PetscFunctionBegin; 1859 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1860 1861 /* bs >1 is not being tested yet! */ 1862 Nbs = mat->cmap->N / bs; 1863 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1864 c->N = Nbs; 1865 c->m = c->M; 1866 c->rstart = 0; 1867 c->brows = 100; 1868 1869 c->ncolors = nis; 1870 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1871 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1872 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1873 1874 brows = c->brows; 1875 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1876 if (flg) c->brows = brows; 1877 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1878 1879 colorforrow[0] = 0; 1880 rows_i = rows; 1881 den2sp_i = den2sp; 1882 1883 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1884 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1885 1886 colorforcol[0] = 0; 1887 columns_i = columns; 1888 1889 /* get column-wise storage of mat */ 1890 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1891 1892 cm = c->m; 1893 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1894 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1895 for (i = 0; i < nis; i++) { /* loop over color */ 1896 PetscCall(ISGetLocalSize(isa[i], &n)); 1897 PetscCall(ISGetIndices(isa[i], &is)); 1898 1899 c->ncolumns[i] = n; 1900 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1901 colorforcol[i + 1] = colorforcol[i] + n; 1902 columns_i += n; 1903 1904 /* fast, crude version requires O(N*N) work */ 1905 PetscCall(PetscArrayzero(rowhit, cm)); 1906 1907 for (j = 0; j < n; j++) { /* loop over columns*/ 1908 col = is[j]; 1909 row_idx = cj + ci[col]; 1910 m = ci[col + 1] - ci[col]; 1911 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1912 idxhit[*row_idx] = spidx[ci[col] + k]; 1913 rowhit[*row_idx++] = col + 1; 1914 } 1915 } 1916 /* count the number of hits */ 1917 nrows = 0; 1918 for (j = 0; j < cm; j++) { 1919 if (rowhit[j]) nrows++; 1920 } 1921 c->nrows[i] = nrows; 1922 colorforrow[i + 1] = colorforrow[i] + nrows; 1923 1924 nrows = 0; 1925 for (j = 0; j < cm; j++) { /* loop over rows */ 1926 if (rowhit[j]) { 1927 rows_i[nrows] = j; 1928 den2sp_i[nrows] = idxhit[j]; 1929 nrows++; 1930 } 1931 } 1932 den2sp_i += nrows; 1933 1934 PetscCall(ISRestoreIndices(isa[i], &is)); 1935 rows_i += nrows; 1936 } 1937 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1938 PetscCall(PetscFree(rowhit)); 1939 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1940 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1941 1942 c->colorforrow = colorforrow; 1943 c->rows = rows; 1944 c->den2sp = den2sp; 1945 c->colorforcol = colorforcol; 1946 c->columns = columns; 1947 1948 PetscCall(PetscFree(idxhit)); 1949 PetscFunctionReturn(PETSC_SUCCESS); 1950 } 1951 1952 /* --------------------------------------------------------------- */ 1953 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1954 { 1955 Mat_Product *product = C->product; 1956 Mat A = product->A, B = product->B; 1957 1958 PetscFunctionBegin; 1959 if (C->ops->mattransposemultnumeric) { 1960 /* Alg: "outerproduct" */ 1961 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1962 } else { 1963 /* Alg: "matmatmult" -- C = At*B */ 1964 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1965 1966 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1967 if (atb->At) { 1968 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1969 user may have called MatProductReplaceMats() to get this A=product->A */ 1970 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1971 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1972 } 1973 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1974 } 1975 PetscFunctionReturn(PETSC_SUCCESS); 1976 } 1977 1978 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1979 { 1980 Mat_Product *product = C->product; 1981 Mat A = product->A, B = product->B; 1982 PetscReal fill = product->fill; 1983 1984 PetscFunctionBegin; 1985 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1986 1987 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1988 PetscFunctionReturn(PETSC_SUCCESS); 1989 } 1990 1991 /* --------------------------------------------------------------- */ 1992 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1993 { 1994 Mat_Product *product = C->product; 1995 PetscInt alg = 0; /* default algorithm */ 1996 PetscBool flg = PETSC_FALSE; 1997 #if !defined(PETSC_HAVE_HYPRE) 1998 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1999 PetscInt nalg = 7; 2000 #else 2001 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 2002 PetscInt nalg = 8; 2003 #endif 2004 2005 PetscFunctionBegin; 2006 /* Set default algorithm */ 2007 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2008 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2009 2010 /* Get runtime option */ 2011 if (product->api_user) { 2012 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2013 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2014 PetscOptionsEnd(); 2015 } else { 2016 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2017 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2018 PetscOptionsEnd(); 2019 } 2020 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2021 2022 C->ops->productsymbolic = MatProductSymbolic_AB; 2023 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2024 PetscFunctionReturn(PETSC_SUCCESS); 2025 } 2026 2027 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2028 { 2029 Mat_Product *product = C->product; 2030 PetscInt alg = 0; /* default algorithm */ 2031 PetscBool flg = PETSC_FALSE; 2032 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2033 PetscInt nalg = 3; 2034 2035 PetscFunctionBegin; 2036 /* Get runtime option */ 2037 if (product->api_user) { 2038 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2039 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2040 PetscOptionsEnd(); 2041 } else { 2042 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2043 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2044 PetscOptionsEnd(); 2045 } 2046 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2047 2048 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2049 PetscFunctionReturn(PETSC_SUCCESS); 2050 } 2051 2052 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2053 { 2054 Mat_Product *product = C->product; 2055 PetscInt alg = 0; /* default algorithm */ 2056 PetscBool flg = PETSC_FALSE; 2057 const char *algTypes[2] = {"default", "color"}; 2058 PetscInt nalg = 2; 2059 2060 PetscFunctionBegin; 2061 /* Set default algorithm */ 2062 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2063 if (!flg) { 2064 alg = 1; 2065 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2066 } 2067 2068 /* Get runtime option */ 2069 if (product->api_user) { 2070 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2071 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2072 PetscOptionsEnd(); 2073 } else { 2074 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2075 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2076 PetscOptionsEnd(); 2077 } 2078 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2079 2080 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2081 C->ops->productsymbolic = MatProductSymbolic_ABt; 2082 PetscFunctionReturn(PETSC_SUCCESS); 2083 } 2084 2085 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2086 { 2087 Mat_Product *product = C->product; 2088 PetscBool flg = PETSC_FALSE; 2089 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2090 #if !defined(PETSC_HAVE_HYPRE) 2091 const char *algTypes[2] = {"scalable", "rap"}; 2092 PetscInt nalg = 2; 2093 #else 2094 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2095 PetscInt nalg = 3; 2096 #endif 2097 2098 PetscFunctionBegin; 2099 /* Set default algorithm */ 2100 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2101 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2102 2103 /* Get runtime option */ 2104 if (product->api_user) { 2105 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2106 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2107 PetscOptionsEnd(); 2108 } else { 2109 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2110 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2111 PetscOptionsEnd(); 2112 } 2113 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2114 2115 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2116 PetscFunctionReturn(PETSC_SUCCESS); 2117 } 2118 2119 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2120 { 2121 Mat_Product *product = C->product; 2122 PetscBool flg = PETSC_FALSE; 2123 PetscInt alg = 0; /* default algorithm */ 2124 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2125 PetscInt nalg = 3; 2126 2127 PetscFunctionBegin; 2128 /* Set default algorithm */ 2129 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2130 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2131 2132 /* Get runtime option */ 2133 if (product->api_user) { 2134 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2135 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2136 PetscOptionsEnd(); 2137 } else { 2138 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2139 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2140 PetscOptionsEnd(); 2141 } 2142 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2143 2144 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2145 PetscFunctionReturn(PETSC_SUCCESS); 2146 } 2147 2148 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2149 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2150 { 2151 Mat_Product *product = C->product; 2152 PetscInt alg = 0; /* default algorithm */ 2153 PetscBool flg = PETSC_FALSE; 2154 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2155 PetscInt nalg = 7; 2156 2157 PetscFunctionBegin; 2158 /* Set default algorithm */ 2159 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2160 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2161 2162 /* Get runtime option */ 2163 if (product->api_user) { 2164 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2165 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2166 PetscOptionsEnd(); 2167 } else { 2168 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2169 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2170 PetscOptionsEnd(); 2171 } 2172 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2173 2174 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2175 C->ops->productsymbolic = MatProductSymbolic_ABC; 2176 PetscFunctionReturn(PETSC_SUCCESS); 2177 } 2178 2179 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2180 { 2181 Mat_Product *product = C->product; 2182 2183 PetscFunctionBegin; 2184 switch (product->type) { 2185 case MATPRODUCT_AB: 2186 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2187 break; 2188 case MATPRODUCT_AtB: 2189 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2190 break; 2191 case MATPRODUCT_ABt: 2192 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2193 break; 2194 case MATPRODUCT_PtAP: 2195 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2196 break; 2197 case MATPRODUCT_RARt: 2198 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2199 break; 2200 case MATPRODUCT_ABC: 2201 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2202 break; 2203 default: 2204 break; 2205 } 2206 PetscFunctionReturn(PETSC_SUCCESS); 2207 } 2208