1 /* 2 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 3 C = A * B 4 */ 5 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <petscbt.h> 9 #include <petsc/private/isimpl.h> 10 #include <../src/mat/impls/dense/seq/dense.h> 11 12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 13 { 14 PetscFunctionBegin; 15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 17 PetscFunctionReturn(PETSC_SUCCESS); 18 } 19 20 /* Modified from MatCreateSeqAIJWithArrays() */ 21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 22 { 23 PetscInt ii; 24 Mat_SeqAIJ *aij; 25 PetscBool isseqaij, osingle, ofree_a, ofree_ij; 26 27 PetscFunctionBegin; 28 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 29 PetscCall(MatSetSizes(mat, m, n, m, n)); 30 31 if (!mtype) { 32 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 33 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 34 } else { 35 PetscCall(MatSetType(mat, mtype)); 36 } 37 38 aij = (Mat_SeqAIJ *)(mat)->data; 39 osingle = aij->singlemalloc; 40 ofree_a = aij->free_a; 41 ofree_ij = aij->free_ij; 42 /* changes the free flags */ 43 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 44 45 PetscCall(PetscFree(aij->ilen)); 46 PetscCall(PetscFree(aij->imax)); 47 PetscCall(PetscMalloc1(m, &aij->imax)); 48 PetscCall(PetscMalloc1(m, &aij->ilen)); 49 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 50 const PetscInt rnz = i[ii + 1] - i[ii]; 51 aij->nonzerorowcnt += !!rnz; 52 aij->rmax = PetscMax(aij->rmax, rnz); 53 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 54 } 55 aij->maxnz = i[m]; 56 aij->nz = i[m]; 57 58 if (osingle) { 59 PetscCall(PetscFree3(aij->a, aij->j, aij->i)); 60 } else { 61 if (ofree_a) PetscCall(PetscFree(aij->a)); 62 if (ofree_ij) PetscCall(PetscFree(aij->j)); 63 if (ofree_ij) PetscCall(PetscFree(aij->i)); 64 } 65 aij->i = i; 66 aij->j = j; 67 aij->a = a; 68 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 69 /* default to not retain ownership */ 70 aij->singlemalloc = PETSC_FALSE; 71 aij->free_a = PETSC_FALSE; 72 aij->free_ij = PETSC_FALSE; 73 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 74 PetscFunctionReturn(PETSC_SUCCESS); 75 } 76 77 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 78 { 79 Mat_Product *product = C->product; 80 MatProductAlgorithm alg; 81 PetscBool flg; 82 83 PetscFunctionBegin; 84 if (product) { 85 alg = product->alg; 86 } else { 87 alg = "sorted"; 88 } 89 /* sorted */ 90 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 91 if (flg) { 92 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 93 PetscFunctionReturn(PETSC_SUCCESS); 94 } 95 96 /* scalable */ 97 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 98 if (flg) { 99 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 100 PetscFunctionReturn(PETSC_SUCCESS); 101 } 102 103 /* scalable_fast */ 104 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 105 if (flg) { 106 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 107 PetscFunctionReturn(PETSC_SUCCESS); 108 } 109 110 /* heap */ 111 PetscCall(PetscStrcmp(alg, "heap", &flg)); 112 if (flg) { 113 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 114 PetscFunctionReturn(PETSC_SUCCESS); 115 } 116 117 /* btheap */ 118 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 119 if (flg) { 120 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 /* llcondensed */ 125 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 126 if (flg) { 127 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 128 PetscFunctionReturn(PETSC_SUCCESS); 129 } 130 131 /* rowmerge */ 132 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 133 if (flg) { 134 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137 138 #if defined(PETSC_HAVE_HYPRE) 139 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 140 if (flg) { 141 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 142 PetscFunctionReturn(PETSC_SUCCESS); 143 } 144 #endif 145 146 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 147 } 148 149 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 150 { 151 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 152 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 153 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 154 PetscReal afill; 155 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 156 PetscHMapI ta; 157 PetscBT lnkbt; 158 PetscFreeSpaceList free_space = NULL, current_space = NULL; 159 160 PetscFunctionBegin; 161 /* Get ci and cj */ 162 /* Allocate ci array, arrays for fill computation and */ 163 /* free space for accumulating nonzero column info */ 164 PetscCall(PetscMalloc1(am + 2, &ci)); 165 ci[0] = 0; 166 167 /* create and initialize a linked list */ 168 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 169 MatRowMergeMax_SeqAIJ(b, bm, ta); 170 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 171 PetscCall(PetscHMapIDestroy(&ta)); 172 173 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 174 175 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 176 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 177 178 current_space = free_space; 179 180 /* Determine ci and cj */ 181 for (i = 0; i < am; i++) { 182 anzi = ai[i + 1] - ai[i]; 183 aj = a->j + ai[i]; 184 for (j = 0; j < anzi; j++) { 185 brow = aj[j]; 186 bnzj = bi[brow + 1] - bi[brow]; 187 bj = b->j + bi[brow]; 188 /* add non-zero cols of B into the sorted linked list lnk */ 189 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 190 } 191 /* add possible missing diagonal entry */ 192 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 193 cnzi = lnk[0]; 194 195 /* If free space is not available, make more free space */ 196 /* Double the amount of total space in the list */ 197 if (current_space->local_remaining < cnzi) { 198 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 199 ndouble++; 200 } 201 202 /* Copy data into free space, then initialize lnk */ 203 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 204 205 current_space->array += cnzi; 206 current_space->local_used += cnzi; 207 current_space->local_remaining -= cnzi; 208 209 ci[i + 1] = ci[i] + cnzi; 210 } 211 212 /* Column indices are in the list of free space */ 213 /* Allocate space for cj, initialize cj, and */ 214 /* destroy list of free space and other temporary array(s) */ 215 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 216 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 217 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 218 219 /* put together the new symbolic matrix */ 220 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 221 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 222 223 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 224 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 225 c = (Mat_SeqAIJ *)(C->data); 226 c->free_a = PETSC_FALSE; 227 c->free_ij = PETSC_TRUE; 228 c->nonew = 0; 229 230 /* fast, needs non-scalable O(bn) array 'abdense' */ 231 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 232 233 /* set MatInfo */ 234 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 235 if (afill < 1.0) afill = 1.0; 236 C->info.mallocs = ndouble; 237 C->info.fill_ratio_given = fill; 238 C->info.fill_ratio_needed = afill; 239 240 #if defined(PETSC_USE_INFO) 241 if (ci[am]) { 242 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 243 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 244 } else { 245 PetscCall(PetscInfo(C, "Empty matrix product\n")); 246 } 247 #endif 248 PetscFunctionReturn(PETSC_SUCCESS); 249 } 250 251 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 252 { 253 PetscLogDouble flops = 0.0; 254 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 255 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 256 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 257 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 258 PetscInt am = A->rmap->n, cm = C->rmap->n; 259 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 260 PetscScalar *ca, valtmp; 261 PetscScalar *ab_dense; 262 PetscContainer cab_dense; 263 const PetscScalar *aa, *ba, *baj; 264 265 PetscFunctionBegin; 266 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 267 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 268 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 269 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 270 c->a = ca; 271 c->free_a = PETSC_TRUE; 272 } else ca = c->a; 273 274 /* TODO this should be done in the symbolic phase */ 275 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 276 that is hard to eradicate) */ 277 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 278 if (!cab_dense) { 279 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 280 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 281 PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 282 PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 283 PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 284 PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 285 } 286 PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 287 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 288 289 /* clean old values in C */ 290 PetscCall(PetscArrayzero(ca, ci[cm])); 291 /* Traverse A row-wise. */ 292 /* Build the ith row in C by summing over nonzero columns in A, */ 293 /* the rows of B corresponding to nonzeros of A. */ 294 for (i = 0; i < am; i++) { 295 anzi = ai[i + 1] - ai[i]; 296 for (j = 0; j < anzi; j++) { 297 brow = aj[j]; 298 bnzi = bi[brow + 1] - bi[brow]; 299 bjj = bj + bi[brow]; 300 baj = ba + bi[brow]; 301 /* perform dense axpy */ 302 valtmp = aa[j]; 303 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 304 flops += 2 * bnzi; 305 } 306 aj += anzi; 307 aa += anzi; 308 309 cnzi = ci[i + 1] - ci[i]; 310 for (k = 0; k < cnzi; k++) { 311 ca[k] += ab_dense[cj[k]]; 312 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 313 } 314 flops += cnzi; 315 cj += cnzi; 316 ca += cnzi; 317 } 318 #if defined(PETSC_HAVE_DEVICE) 319 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 320 #endif 321 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 322 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 323 PetscCall(PetscLogFlops(flops)); 324 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 325 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 326 PetscFunctionReturn(PETSC_SUCCESS); 327 } 328 329 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 330 { 331 PetscLogDouble flops = 0.0; 332 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 333 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 334 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 335 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 336 PetscInt am = A->rmap->N, cm = C->rmap->N; 337 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 338 PetscScalar *ca = c->a, valtmp; 339 const PetscScalar *aa, *ba, *baj; 340 PetscInt nextb; 341 342 PetscFunctionBegin; 343 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 344 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 345 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 346 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 347 c->a = ca; 348 c->free_a = PETSC_TRUE; 349 } 350 351 /* clean old values in C */ 352 PetscCall(PetscArrayzero(ca, ci[cm])); 353 /* Traverse A row-wise. */ 354 /* Build the ith row in C by summing over nonzero columns in A, */ 355 /* the rows of B corresponding to nonzeros of A. */ 356 for (i = 0; i < am; i++) { 357 anzi = ai[i + 1] - ai[i]; 358 cnzi = ci[i + 1] - ci[i]; 359 for (j = 0; j < anzi; j++) { 360 brow = aj[j]; 361 bnzi = bi[brow + 1] - bi[brow]; 362 bjj = bj + bi[brow]; 363 baj = ba + bi[brow]; 364 /* perform sparse axpy */ 365 valtmp = aa[j]; 366 nextb = 0; 367 for (k = 0; nextb < bnzi; k++) { 368 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 369 ca[k] += valtmp * baj[nextb++]; 370 } 371 } 372 flops += 2 * bnzi; 373 } 374 aj += anzi; 375 aa += anzi; 376 cj += cnzi; 377 ca += cnzi; 378 } 379 #if defined(PETSC_HAVE_DEVICE) 380 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 381 #endif 382 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 383 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 384 PetscCall(PetscLogFlops(flops)); 385 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 386 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 387 PetscFunctionReturn(PETSC_SUCCESS); 388 } 389 390 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 393 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 394 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 395 MatScalar *ca; 396 PetscReal afill; 397 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 398 PetscHMapI ta; 399 PetscFreeSpaceList free_space = NULL, current_space = NULL; 400 401 PetscFunctionBegin; 402 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 403 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 404 PetscCall(PetscMalloc1(am + 2, &ci)); 405 ci[0] = 0; 406 407 /* create and initialize a linked list */ 408 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 409 MatRowMergeMax_SeqAIJ(b, bm, ta); 410 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 411 PetscCall(PetscHMapIDestroy(&ta)); 412 413 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 414 415 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 416 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 417 current_space = free_space; 418 419 /* Determine ci and cj */ 420 for (i = 0; i < am; i++) { 421 anzi = ai[i + 1] - ai[i]; 422 aj = a->j + ai[i]; 423 for (j = 0; j < anzi; j++) { 424 brow = aj[j]; 425 bnzj = bi[brow + 1] - bi[brow]; 426 bj = b->j + bi[brow]; 427 /* add non-zero cols of B into the sorted linked list lnk */ 428 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 429 } 430 /* add possible missing diagonal entry */ 431 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 432 cnzi = lnk[1]; 433 434 /* If free space is not available, make more free space */ 435 /* Double the amount of total space in the list */ 436 if (current_space->local_remaining < cnzi) { 437 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 438 ndouble++; 439 } 440 441 /* Copy data into free space, then initialize lnk */ 442 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 443 444 current_space->array += cnzi; 445 current_space->local_used += cnzi; 446 current_space->local_remaining -= cnzi; 447 448 ci[i + 1] = ci[i] + cnzi; 449 } 450 451 /* Column indices are in the list of free space */ 452 /* Allocate space for cj, initialize cj, and */ 453 /* destroy list of free space and other temporary array(s) */ 454 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 455 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 456 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 457 458 /* Allocate space for ca */ 459 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 460 461 /* put together the new symbolic matrix */ 462 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 463 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 464 465 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 466 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 467 c = (Mat_SeqAIJ *)(C->data); 468 c->free_a = PETSC_TRUE; 469 c->free_ij = PETSC_TRUE; 470 c->nonew = 0; 471 472 /* slower, less memory */ 473 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 474 475 /* set MatInfo */ 476 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 477 if (afill < 1.0) afill = 1.0; 478 C->info.mallocs = ndouble; 479 C->info.fill_ratio_given = fill; 480 C->info.fill_ratio_needed = afill; 481 482 #if defined(PETSC_USE_INFO) 483 if (ci[am]) { 484 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 485 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 486 } else { 487 PetscCall(PetscInfo(C, "Empty matrix product\n")); 488 } 489 #endif 490 PetscFunctionReturn(PETSC_SUCCESS); 491 } 492 493 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 494 { 495 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 496 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 497 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 498 MatScalar *ca; 499 PetscReal afill; 500 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 501 PetscHMapI ta; 502 PetscFreeSpaceList free_space = NULL, current_space = NULL; 503 504 PetscFunctionBegin; 505 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 506 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 507 PetscCall(PetscMalloc1(am + 2, &ci)); 508 ci[0] = 0; 509 510 /* create and initialize a linked list */ 511 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 512 MatRowMergeMax_SeqAIJ(b, bm, ta); 513 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 514 PetscCall(PetscHMapIDestroy(&ta)); 515 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 516 517 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 518 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 519 current_space = free_space; 520 521 /* Determine ci and cj */ 522 for (i = 0; i < am; i++) { 523 anzi = ai[i + 1] - ai[i]; 524 aj = a->j + ai[i]; 525 for (j = 0; j < anzi; j++) { 526 brow = aj[j]; 527 bnzj = bi[brow + 1] - bi[brow]; 528 bj = b->j + bi[brow]; 529 /* add non-zero cols of B into the sorted linked list lnk */ 530 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 531 } 532 /* add possible missing diagonal entry */ 533 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 534 535 cnzi = lnk[0]; 536 537 /* If free space is not available, make more free space */ 538 /* Double the amount of total space in the list */ 539 if (current_space->local_remaining < cnzi) { 540 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 541 ndouble++; 542 } 543 544 /* Copy data into free space, then initialize lnk */ 545 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 546 547 current_space->array += cnzi; 548 current_space->local_used += cnzi; 549 current_space->local_remaining -= cnzi; 550 551 ci[i + 1] = ci[i] + cnzi; 552 } 553 554 /* Column indices are in the list of free space */ 555 /* Allocate space for cj, initialize cj, and */ 556 /* destroy list of free space and other temporary array(s) */ 557 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 558 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 559 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 560 561 /* Allocate space for ca */ 562 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 563 564 /* put together the new symbolic matrix */ 565 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 566 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 567 568 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 569 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 570 c = (Mat_SeqAIJ *)(C->data); 571 c->free_a = PETSC_TRUE; 572 c->free_ij = PETSC_TRUE; 573 c->nonew = 0; 574 575 /* slower, less memory */ 576 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 577 578 /* set MatInfo */ 579 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 580 if (afill < 1.0) afill = 1.0; 581 C->info.mallocs = ndouble; 582 C->info.fill_ratio_given = fill; 583 C->info.fill_ratio_needed = afill; 584 585 #if defined(PETSC_USE_INFO) 586 if (ci[am]) { 587 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 588 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 589 } else { 590 PetscCall(PetscInfo(C, "Empty matrix product\n")); 591 } 592 #endif 593 PetscFunctionReturn(PETSC_SUCCESS); 594 } 595 596 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 597 { 598 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 599 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 600 PetscInt *ci, *cj, *bb; 601 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 602 PetscReal afill; 603 PetscInt i, j, col, ndouble = 0; 604 PetscFreeSpaceList free_space = NULL, current_space = NULL; 605 PetscHeap h; 606 607 PetscFunctionBegin; 608 /* Get ci and cj - by merging sorted rows using a heap */ 609 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 610 PetscCall(PetscMalloc1(am + 2, &ci)); 611 ci[0] = 0; 612 613 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 614 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 615 current_space = free_space; 616 617 PetscCall(PetscHeapCreate(a->rmax, &h)); 618 PetscCall(PetscMalloc1(a->rmax, &bb)); 619 620 /* Determine ci and cj */ 621 for (i = 0; i < am; i++) { 622 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 623 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 624 ci[i + 1] = ci[i]; 625 /* Populate the min heap */ 626 for (j = 0; j < anzi; j++) { 627 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 628 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 629 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 630 } 631 } 632 /* Pick off the min element, adding it to free space */ 633 PetscCall(PetscHeapPop(h, &j, &col)); 634 while (j >= 0) { 635 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 636 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 637 ndouble++; 638 } 639 *(current_space->array++) = col; 640 current_space->local_used++; 641 current_space->local_remaining--; 642 ci[i + 1]++; 643 644 /* stash if anything else remains in this row of B */ 645 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 646 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 647 PetscInt j2, col2; 648 PetscCall(PetscHeapPeek(h, &j2, &col2)); 649 if (col2 != col) break; 650 PetscCall(PetscHeapPop(h, &j2, &col2)); 651 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 652 } 653 /* Put any stashed elements back into the min heap */ 654 PetscCall(PetscHeapUnstash(h)); 655 PetscCall(PetscHeapPop(h, &j, &col)); 656 } 657 } 658 PetscCall(PetscFree(bb)); 659 PetscCall(PetscHeapDestroy(&h)); 660 661 /* Column indices are in the list of free space */ 662 /* Allocate space for cj, initialize cj, and */ 663 /* destroy list of free space and other temporary array(s) */ 664 PetscCall(PetscMalloc1(ci[am], &cj)); 665 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 666 667 /* put together the new symbolic matrix */ 668 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 669 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 670 671 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 672 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 673 c = (Mat_SeqAIJ *)(C->data); 674 c->free_a = PETSC_TRUE; 675 c->free_ij = PETSC_TRUE; 676 c->nonew = 0; 677 678 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 679 680 /* set MatInfo */ 681 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 682 if (afill < 1.0) afill = 1.0; 683 C->info.mallocs = ndouble; 684 C->info.fill_ratio_given = fill; 685 C->info.fill_ratio_needed = afill; 686 687 #if defined(PETSC_USE_INFO) 688 if (ci[am]) { 689 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 690 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 691 } else { 692 PetscCall(PetscInfo(C, "Empty matrix product\n")); 693 } 694 #endif 695 PetscFunctionReturn(PETSC_SUCCESS); 696 } 697 698 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 699 { 700 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 701 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 702 PetscInt *ci, *cj, *bb; 703 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 704 PetscReal afill; 705 PetscInt i, j, col, ndouble = 0; 706 PetscFreeSpaceList free_space = NULL, current_space = NULL; 707 PetscHeap h; 708 PetscBT bt; 709 710 PetscFunctionBegin; 711 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 712 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 713 PetscCall(PetscMalloc1(am + 2, &ci)); 714 ci[0] = 0; 715 716 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 717 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 718 719 current_space = free_space; 720 721 PetscCall(PetscHeapCreate(a->rmax, &h)); 722 PetscCall(PetscMalloc1(a->rmax, &bb)); 723 PetscCall(PetscBTCreate(bn, &bt)); 724 725 /* Determine ci and cj */ 726 for (i = 0; i < am; i++) { 727 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 728 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 729 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 730 ci[i + 1] = ci[i]; 731 /* Populate the min heap */ 732 for (j = 0; j < anzi; j++) { 733 PetscInt brow = acol[j]; 734 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 735 PetscInt bcol = bj[bb[j]]; 736 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 737 PetscCall(PetscHeapAdd(h, j, bcol)); 738 bb[j]++; 739 break; 740 } 741 } 742 } 743 /* Pick off the min element, adding it to free space */ 744 PetscCall(PetscHeapPop(h, &j, &col)); 745 while (j >= 0) { 746 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 747 fptr = NULL; /* need PetscBTMemzero */ 748 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 749 ndouble++; 750 } 751 *(current_space->array++) = col; 752 current_space->local_used++; 753 current_space->local_remaining--; 754 ci[i + 1]++; 755 756 /* stash if anything else remains in this row of B */ 757 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 758 PetscInt bcol = bj[bb[j]]; 759 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 760 PetscCall(PetscHeapAdd(h, j, bcol)); 761 bb[j]++; 762 break; 763 } 764 } 765 PetscCall(PetscHeapPop(h, &j, &col)); 766 } 767 if (fptr) { /* Clear the bits for this row */ 768 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 769 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 770 PetscCall(PetscBTMemzero(bn, bt)); 771 } 772 } 773 PetscCall(PetscFree(bb)); 774 PetscCall(PetscHeapDestroy(&h)); 775 PetscCall(PetscBTDestroy(&bt)); 776 777 /* Column indices are in the list of free space */ 778 /* Allocate space for cj, initialize cj, and */ 779 /* destroy list of free space and other temporary array(s) */ 780 PetscCall(PetscMalloc1(ci[am], &cj)); 781 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 782 783 /* put together the new symbolic matrix */ 784 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 785 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 786 787 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 788 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 789 c = (Mat_SeqAIJ *)(C->data); 790 c->free_a = PETSC_TRUE; 791 c->free_ij = PETSC_TRUE; 792 c->nonew = 0; 793 794 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 795 796 /* set MatInfo */ 797 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 798 if (afill < 1.0) afill = 1.0; 799 C->info.mallocs = ndouble; 800 C->info.fill_ratio_given = fill; 801 C->info.fill_ratio_needed = afill; 802 803 #if defined(PETSC_USE_INFO) 804 if (ci[am]) { 805 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 806 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 807 } else { 808 PetscCall(PetscInfo(C, "Empty matrix product\n")); 809 } 810 #endif 811 PetscFunctionReturn(PETSC_SUCCESS); 812 } 813 814 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 815 { 816 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 817 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 818 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 819 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 820 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 821 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 822 const PetscInt *brow_ptr[8], *brow_end[8]; 823 PetscInt window[8]; 824 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 825 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 826 PetscReal afill; 827 PetscInt *workj_L1, *workj_L2, *workj_L3; 828 PetscInt L1_nnz, L2_nnz; 829 830 /* Step 1: Get upper bound on memory required for allocation. 831 Because of the way virtual memory works, 832 only the memory pages that are actually needed will be physically allocated. */ 833 PetscFunctionBegin; 834 PetscCall(PetscMalloc1(am + 1, &ci)); 835 for (i = 0; i < am; i++) { 836 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 837 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 838 a_rownnz = 0; 839 for (k = 0; k < anzi; ++k) { 840 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 841 if (a_rownnz > bn) { 842 a_rownnz = bn; 843 break; 844 } 845 } 846 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 847 } 848 /* temporary work areas for merging rows */ 849 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 850 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 851 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 852 853 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 854 c_maxmem = 8 * (ai[am] + bi[bm]); 855 /* Step 2: Populate pattern for C */ 856 PetscCall(PetscMalloc1(c_maxmem, &cj)); 857 858 ci_nnz = 0; 859 ci[0] = 0; 860 worki_L1[0] = 0; 861 worki_L2[0] = 0; 862 for (i = 0; i < am; i++) { 863 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 864 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 865 rowsleft = anzi; 866 inputcol_L1 = acol; 867 L2_nnz = 0; 868 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 869 worki_L2[1] = 0; 870 outputi_nnz = 0; 871 872 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 873 while (ci_nnz + a_maxrownnz > c_maxmem) { 874 c_maxmem *= 2; 875 ndouble++; 876 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 877 } 878 879 while (rowsleft) { 880 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 881 L1_nrows = 0; 882 L1_nnz = 0; 883 inputcol = inputcol_L1; 884 inputi = bi; 885 inputj = bj; 886 887 /* The following macro is used to specialize for small rows in A. 888 This helps with compiler unrolling, improving performance substantially. 889 Input: inputj inputi inputcol bn 890 Output: outputj outputi_nnz */ 891 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 892 do { \ 893 window_min = bn; \ 894 outputi_nnz = 0; \ 895 for (k = 0; k < ANNZ; ++k) { \ 896 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 897 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 898 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 899 window_min = PetscMin(window[k], window_min); \ 900 } \ 901 while (window_min < bn) { \ 902 outputj[outputi_nnz++] = window_min; \ 903 /* advance front and compute new minimum */ \ 904 old_window_min = window_min; \ 905 window_min = bn; \ 906 for (k = 0; k < ANNZ; ++k) { \ 907 if (window[k] == old_window_min) { \ 908 brow_ptr[k]++; \ 909 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 910 } \ 911 window_min = PetscMin(window[k], window_min); \ 912 } \ 913 } \ 914 } while (0) 915 916 /************** L E V E L 1 ***************/ 917 /* Merge up to 8 rows of B to L1 work array*/ 918 while (L1_rowsleft) { 919 outputi_nnz = 0; 920 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 921 else outputj = cj + ci_nnz; /* Merge directly to C */ 922 923 switch (L1_rowsleft) { 924 case 1: 925 brow_ptr[0] = inputj + inputi[inputcol[0]]; 926 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 927 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 928 inputcol += L1_rowsleft; 929 rowsleft -= L1_rowsleft; 930 L1_rowsleft = 0; 931 break; 932 case 2: 933 MatMatMultSymbolic_RowMergeMacro(2); 934 inputcol += L1_rowsleft; 935 rowsleft -= L1_rowsleft; 936 L1_rowsleft = 0; 937 break; 938 case 3: 939 MatMatMultSymbolic_RowMergeMacro(3); 940 inputcol += L1_rowsleft; 941 rowsleft -= L1_rowsleft; 942 L1_rowsleft = 0; 943 break; 944 case 4: 945 MatMatMultSymbolic_RowMergeMacro(4); 946 inputcol += L1_rowsleft; 947 rowsleft -= L1_rowsleft; 948 L1_rowsleft = 0; 949 break; 950 case 5: 951 MatMatMultSymbolic_RowMergeMacro(5); 952 inputcol += L1_rowsleft; 953 rowsleft -= L1_rowsleft; 954 L1_rowsleft = 0; 955 break; 956 case 6: 957 MatMatMultSymbolic_RowMergeMacro(6); 958 inputcol += L1_rowsleft; 959 rowsleft -= L1_rowsleft; 960 L1_rowsleft = 0; 961 break; 962 case 7: 963 MatMatMultSymbolic_RowMergeMacro(7); 964 inputcol += L1_rowsleft; 965 rowsleft -= L1_rowsleft; 966 L1_rowsleft = 0; 967 break; 968 default: 969 MatMatMultSymbolic_RowMergeMacro(8); 970 inputcol += 8; 971 rowsleft -= 8; 972 L1_rowsleft -= 8; 973 break; 974 } 975 inputcol_L1 = inputcol; 976 L1_nnz += outputi_nnz; 977 worki_L1[++L1_nrows] = L1_nnz; 978 } 979 980 /********************** L E V E L 2 ************************/ 981 /* Merge from L1 work array to either C or to L2 work array */ 982 if (anzi > 8) { 983 inputi = worki_L1; 984 inputj = workj_L1; 985 inputcol = workcol; 986 outputi_nnz = 0; 987 988 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 989 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 990 991 switch (L1_nrows) { 992 case 1: 993 brow_ptr[0] = inputj + inputi[inputcol[0]]; 994 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 995 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 996 break; 997 case 2: 998 MatMatMultSymbolic_RowMergeMacro(2); 999 break; 1000 case 3: 1001 MatMatMultSymbolic_RowMergeMacro(3); 1002 break; 1003 case 4: 1004 MatMatMultSymbolic_RowMergeMacro(4); 1005 break; 1006 case 5: 1007 MatMatMultSymbolic_RowMergeMacro(5); 1008 break; 1009 case 6: 1010 MatMatMultSymbolic_RowMergeMacro(6); 1011 break; 1012 case 7: 1013 MatMatMultSymbolic_RowMergeMacro(7); 1014 break; 1015 case 8: 1016 MatMatMultSymbolic_RowMergeMacro(8); 1017 break; 1018 default: 1019 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1020 } 1021 L2_nnz += outputi_nnz; 1022 worki_L2[++L2_nrows] = L2_nnz; 1023 1024 /************************ L E V E L 3 **********************/ 1025 /* Merge from L2 work array to either C or to L2 work array */ 1026 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1027 inputi = worki_L2; 1028 inputj = workj_L2; 1029 inputcol = workcol; 1030 outputi_nnz = 0; 1031 if (rowsleft) outputj = workj_L3; 1032 else outputj = cj + ci_nnz; 1033 switch (L2_nrows) { 1034 case 1: 1035 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1036 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1037 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1038 break; 1039 case 2: 1040 MatMatMultSymbolic_RowMergeMacro(2); 1041 break; 1042 case 3: 1043 MatMatMultSymbolic_RowMergeMacro(3); 1044 break; 1045 case 4: 1046 MatMatMultSymbolic_RowMergeMacro(4); 1047 break; 1048 case 5: 1049 MatMatMultSymbolic_RowMergeMacro(5); 1050 break; 1051 case 6: 1052 MatMatMultSymbolic_RowMergeMacro(6); 1053 break; 1054 case 7: 1055 MatMatMultSymbolic_RowMergeMacro(7); 1056 break; 1057 case 8: 1058 MatMatMultSymbolic_RowMergeMacro(8); 1059 break; 1060 default: 1061 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1062 } 1063 L2_nrows = 1; 1064 L2_nnz = outputi_nnz; 1065 worki_L2[1] = outputi_nnz; 1066 /* Copy to workj_L2 */ 1067 if (rowsleft) { 1068 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1069 } 1070 } 1071 } 1072 } /* while (rowsleft) */ 1073 #undef MatMatMultSymbolic_RowMergeMacro 1074 1075 /* terminate current row */ 1076 ci_nnz += outputi_nnz; 1077 ci[i + 1] = ci_nnz; 1078 } 1079 1080 /* Step 3: Create the new symbolic matrix */ 1081 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1082 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1083 1084 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1085 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1086 c = (Mat_SeqAIJ *)(C->data); 1087 c->free_a = PETSC_TRUE; 1088 c->free_ij = PETSC_TRUE; 1089 c->nonew = 0; 1090 1091 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1092 1093 /* set MatInfo */ 1094 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1095 if (afill < 1.0) afill = 1.0; 1096 C->info.mallocs = ndouble; 1097 C->info.fill_ratio_given = fill; 1098 C->info.fill_ratio_needed = afill; 1099 1100 #if defined(PETSC_USE_INFO) 1101 if (ci[am]) { 1102 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1103 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1104 } else { 1105 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1106 } 1107 #endif 1108 1109 /* Step 4: Free temporary work areas */ 1110 PetscCall(PetscFree(workj_L1)); 1111 PetscCall(PetscFree(workj_L2)); 1112 PetscCall(PetscFree(workj_L3)); 1113 PetscFunctionReturn(PETSC_SUCCESS); 1114 } 1115 1116 /* concatenate unique entries and then sort */ 1117 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1118 { 1119 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1120 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1121 PetscInt *ci, *cj, bcol; 1122 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1123 PetscReal afill; 1124 PetscInt i, j, ndouble = 0; 1125 PetscSegBuffer seg, segrow; 1126 char *seen; 1127 1128 PetscFunctionBegin; 1129 PetscCall(PetscMalloc1(am + 1, &ci)); 1130 ci[0] = 0; 1131 1132 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1133 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1134 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1135 PetscCall(PetscCalloc1(bn, &seen)); 1136 1137 /* Determine ci and cj */ 1138 for (i = 0; i < am; i++) { 1139 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1140 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 1141 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1142 1143 /* Pack segrow */ 1144 for (j = 0; j < anzi; j++) { 1145 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1146 for (k = bjstart; k < bjend; k++) { 1147 bcol = bj[k]; 1148 if (!seen[bcol]) { /* new entry */ 1149 PetscInt *PETSC_RESTRICT slot; 1150 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1151 *slot = bcol; 1152 seen[bcol] = 1; 1153 packlen++; 1154 } 1155 } 1156 } 1157 1158 /* Check i-th diagonal entry */ 1159 if (C->force_diagonals && !seen[i]) { 1160 PetscInt *PETSC_RESTRICT slot; 1161 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1162 *slot = i; 1163 seen[i] = 1; 1164 packlen++; 1165 } 1166 1167 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1168 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1169 PetscCall(PetscSortInt(packlen, crow)); 1170 ci[i + 1] = ci[i] + packlen; 1171 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1172 } 1173 PetscCall(PetscSegBufferDestroy(&segrow)); 1174 PetscCall(PetscFree(seen)); 1175 1176 /* Column indices are in the segmented buffer */ 1177 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1178 PetscCall(PetscSegBufferDestroy(&seg)); 1179 1180 /* put together the new symbolic matrix */ 1181 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1182 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1183 1184 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1185 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1186 c = (Mat_SeqAIJ *)(C->data); 1187 c->free_a = PETSC_TRUE; 1188 c->free_ij = PETSC_TRUE; 1189 c->nonew = 0; 1190 1191 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1192 1193 /* set MatInfo */ 1194 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1195 if (afill < 1.0) afill = 1.0; 1196 C->info.mallocs = ndouble; 1197 C->info.fill_ratio_given = fill; 1198 C->info.fill_ratio_needed = afill; 1199 1200 #if defined(PETSC_USE_INFO) 1201 if (ci[am]) { 1202 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1203 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1204 } else { 1205 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1206 } 1207 #endif 1208 PetscFunctionReturn(PETSC_SUCCESS); 1209 } 1210 1211 static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1212 { 1213 Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 1214 1215 PetscFunctionBegin; 1216 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1217 PetscCall(MatDestroy(&abt->Bt_den)); 1218 PetscCall(MatDestroy(&abt->ABt_den)); 1219 PetscCall(PetscFree(abt)); 1220 PetscFunctionReturn(PETSC_SUCCESS); 1221 } 1222 1223 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1224 { 1225 Mat Bt; 1226 Mat_MatMatTransMult *abt; 1227 Mat_Product *product = C->product; 1228 char *alg; 1229 1230 PetscFunctionBegin; 1231 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1232 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1233 1234 /* create symbolic Bt */ 1235 PetscCall(MatTransposeSymbolic(B, &Bt)); 1236 PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1237 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1238 1239 /* get symbolic C=A*Bt */ 1240 PetscCall(PetscStrallocpy(product->alg, &alg)); 1241 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1242 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1243 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1244 PetscCall(PetscFree(alg)); 1245 1246 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1247 PetscCall(PetscNew(&abt)); 1248 1249 product->data = abt; 1250 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1251 1252 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1253 1254 abt->usecoloring = PETSC_FALSE; 1255 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1256 if (abt->usecoloring) { 1257 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1258 MatTransposeColoring matcoloring; 1259 MatColoring coloring; 1260 ISColoring iscoloring; 1261 Mat Bt_dense, C_dense; 1262 1263 /* inode causes memory problem */ 1264 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1265 1266 PetscCall(MatColoringCreate(C, &coloring)); 1267 PetscCall(MatColoringSetDistance(coloring, 2)); 1268 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1269 PetscCall(MatColoringSetFromOptions(coloring)); 1270 PetscCall(MatColoringApply(coloring, &iscoloring)); 1271 PetscCall(MatColoringDestroy(&coloring)); 1272 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1273 1274 abt->matcoloring = matcoloring; 1275 1276 PetscCall(ISColoringDestroy(&iscoloring)); 1277 1278 /* Create Bt_dense and C_dense = A*Bt_dense */ 1279 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1280 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1281 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1282 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1283 1284 Bt_dense->assembled = PETSC_TRUE; 1285 abt->Bt_den = Bt_dense; 1286 1287 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1288 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1289 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1290 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1291 1292 Bt_dense->assembled = PETSC_TRUE; 1293 abt->ABt_den = C_dense; 1294 1295 #if defined(PETSC_USE_INFO) 1296 { 1297 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1298 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1299 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1300 } 1301 #endif 1302 } 1303 /* clean up */ 1304 PetscCall(MatDestroy(&Bt)); 1305 PetscFunctionReturn(PETSC_SUCCESS); 1306 } 1307 1308 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1309 { 1310 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1311 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1312 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1313 PetscLogDouble flops = 0.0; 1314 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1315 Mat_MatMatTransMult *abt; 1316 Mat_Product *product = C->product; 1317 1318 PetscFunctionBegin; 1319 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1320 abt = (Mat_MatMatTransMult *)product->data; 1321 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1322 /* clear old values in C */ 1323 if (!c->a) { 1324 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1325 c->a = ca; 1326 c->free_a = PETSC_TRUE; 1327 } else { 1328 ca = c->a; 1329 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1330 } 1331 1332 if (abt->usecoloring) { 1333 MatTransposeColoring matcoloring = abt->matcoloring; 1334 Mat Bt_dense, C_dense = abt->ABt_den; 1335 1336 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1337 Bt_dense = abt->Bt_den; 1338 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1339 1340 /* C_dense = A*Bt_dense */ 1341 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1342 1343 /* Recover C from C_dense */ 1344 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1345 PetscFunctionReturn(PETSC_SUCCESS); 1346 } 1347 1348 for (i = 0; i < cm; i++) { 1349 anzi = ai[i + 1] - ai[i]; 1350 acol = aj + ai[i]; 1351 aval = aa + ai[i]; 1352 cnzi = ci[i + 1] - ci[i]; 1353 ccol = cj + ci[i]; 1354 cval = ca + ci[i]; 1355 for (j = 0; j < cnzi; j++) { 1356 brow = ccol[j]; 1357 bnzj = bi[brow + 1] - bi[brow]; 1358 bcol = bj + bi[brow]; 1359 bval = ba + bi[brow]; 1360 1361 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1362 nexta = 0; 1363 nextb = 0; 1364 while (nexta < anzi && nextb < bnzj) { 1365 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1366 if (nexta == anzi) break; 1367 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1368 if (nextb == bnzj) break; 1369 if (acol[nexta] == bcol[nextb]) { 1370 cval[j] += aval[nexta] * bval[nextb]; 1371 nexta++; 1372 nextb++; 1373 flops += 2; 1374 } 1375 } 1376 } 1377 } 1378 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1379 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1380 PetscCall(PetscLogFlops(flops)); 1381 PetscFunctionReturn(PETSC_SUCCESS); 1382 } 1383 1384 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1385 { 1386 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 1387 1388 PetscFunctionBegin; 1389 PetscCall(MatDestroy(&atb->At)); 1390 if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 1391 PetscCall(PetscFree(atb)); 1392 PetscFunctionReturn(PETSC_SUCCESS); 1393 } 1394 1395 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1396 { 1397 Mat At = NULL; 1398 Mat_Product *product = C->product; 1399 PetscBool flg, def, square; 1400 1401 PetscFunctionBegin; 1402 MatCheckProduct(C, 4); 1403 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1404 /* outerproduct */ 1405 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1406 if (flg) { 1407 /* create symbolic At */ 1408 if (!square) { 1409 PetscCall(MatTransposeSymbolic(A, &At)); 1410 PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1411 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1412 } 1413 /* get symbolic C=At*B */ 1414 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1415 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1416 1417 /* clean up */ 1418 if (!square) PetscCall(MatDestroy(&At)); 1419 1420 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1421 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1422 PetscFunctionReturn(PETSC_SUCCESS); 1423 } 1424 1425 /* matmatmult */ 1426 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1427 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1428 if (flg || def) { 1429 Mat_MatTransMatMult *atb; 1430 1431 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1432 PetscCall(PetscNew(&atb)); 1433 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1434 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1435 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1436 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1437 product->data = atb; 1438 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1439 atb->At = At; 1440 1441 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1442 PetscFunctionReturn(PETSC_SUCCESS); 1443 } 1444 1445 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1446 } 1447 1448 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1449 { 1450 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1451 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1452 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1453 PetscLogDouble flops = 0.0; 1454 MatScalar *aa = a->a, *ba, *ca, *caj; 1455 1456 PetscFunctionBegin; 1457 if (!c->a) { 1458 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1459 1460 c->a = ca; 1461 c->free_a = PETSC_TRUE; 1462 } else { 1463 ca = c->a; 1464 PetscCall(PetscArrayzero(ca, ci[cm])); 1465 } 1466 1467 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1468 for (i = 0; i < am; i++) { 1469 bj = b->j + bi[i]; 1470 ba = b->a + bi[i]; 1471 bnzi = bi[i + 1] - bi[i]; 1472 anzi = ai[i + 1] - ai[i]; 1473 for (j = 0; j < anzi; j++) { 1474 nextb = 0; 1475 crow = *aj++; 1476 cjj = cj + ci[crow]; 1477 caj = ca + ci[crow]; 1478 /* perform sparse axpy operation. Note cjj includes bj. */ 1479 for (k = 0; nextb < bnzi; k++) { 1480 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1481 caj[k] += (*aa) * (*(ba + nextb)); 1482 nextb++; 1483 } 1484 } 1485 flops += 2 * bnzi; 1486 aa++; 1487 } 1488 } 1489 1490 /* Assemble the final matrix and clean up */ 1491 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1492 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1493 PetscCall(PetscLogFlops(flops)); 1494 PetscFunctionReturn(PETSC_SUCCESS); 1495 } 1496 1497 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1498 { 1499 PetscFunctionBegin; 1500 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1501 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1502 PetscFunctionReturn(PETSC_SUCCESS); 1503 } 1504 1505 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1506 { 1507 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1508 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1509 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1510 const PetscInt *aj; 1511 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1512 PetscInt clda; 1513 PetscInt am4, bm4, col, i, j, n; 1514 1515 PetscFunctionBegin; 1516 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1517 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1518 if (add) { 1519 PetscCall(MatDenseGetArray(C, &c)); 1520 } else { 1521 PetscCall(MatDenseGetArrayWrite(C, &c)); 1522 } 1523 PetscCall(MatDenseGetArrayRead(B, &b)); 1524 PetscCall(MatDenseGetLDA(B, &bm)); 1525 PetscCall(MatDenseGetLDA(C, &clda)); 1526 am4 = 4 * clda; 1527 bm4 = 4 * bm; 1528 if (b) { 1529 b1 = b; 1530 b2 = b1 + bm; 1531 b3 = b2 + bm; 1532 b4 = b3 + bm; 1533 } else b1 = b2 = b3 = b4 = NULL; 1534 c1 = c; 1535 c2 = c1 + clda; 1536 c3 = c2 + clda; 1537 c4 = c3 + clda; 1538 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1539 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1540 r1 = r2 = r3 = r4 = 0.0; 1541 n = a->i[i + 1] - a->i[i]; 1542 aj = a->j ? a->j + a->i[i] : NULL; 1543 aa = av ? av + a->i[i] : NULL; 1544 for (j = 0; j < n; j++) { 1545 const PetscScalar aatmp = aa[j]; 1546 const PetscInt ajtmp = aj[j]; 1547 r1 += aatmp * b1[ajtmp]; 1548 r2 += aatmp * b2[ajtmp]; 1549 r3 += aatmp * b3[ajtmp]; 1550 r4 += aatmp * b4[ajtmp]; 1551 } 1552 if (add) { 1553 c1[i] += r1; 1554 c2[i] += r2; 1555 c3[i] += r3; 1556 c4[i] += r4; 1557 } else { 1558 c1[i] = r1; 1559 c2[i] = r2; 1560 c3[i] = r3; 1561 c4[i] = r4; 1562 } 1563 } 1564 if (b) { 1565 b1 += bm4; 1566 b2 += bm4; 1567 b3 += bm4; 1568 b4 += bm4; 1569 } 1570 c1 += am4; 1571 c2 += am4; 1572 c3 += am4; 1573 c4 += am4; 1574 } 1575 /* process remaining columns */ 1576 if (col != cn) { 1577 PetscInt rc = cn - col; 1578 1579 if (rc == 1) { 1580 for (i = 0; i < am; i++) { 1581 r1 = 0.0; 1582 n = a->i[i + 1] - a->i[i]; 1583 aj = a->j + a->i[i]; 1584 aa = av + a->i[i]; 1585 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1586 if (add) c1[i] += r1; 1587 else c1[i] = r1; 1588 } 1589 } else if (rc == 2) { 1590 for (i = 0; i < am; i++) { 1591 r1 = r2 = 0.0; 1592 n = a->i[i + 1] - a->i[i]; 1593 aj = a->j + a->i[i]; 1594 aa = av + a->i[i]; 1595 for (j = 0; j < n; j++) { 1596 const PetscScalar aatmp = aa[j]; 1597 const PetscInt ajtmp = aj[j]; 1598 r1 += aatmp * b1[ajtmp]; 1599 r2 += aatmp * b2[ajtmp]; 1600 } 1601 if (add) { 1602 c1[i] += r1; 1603 c2[i] += r2; 1604 } else { 1605 c1[i] = r1; 1606 c2[i] = r2; 1607 } 1608 } 1609 } else { 1610 for (i = 0; i < am; i++) { 1611 r1 = r2 = r3 = 0.0; 1612 n = a->i[i + 1] - a->i[i]; 1613 aj = a->j ? a->j + a->i[i] : NULL; 1614 aa = av ? av + a->i[i] : NULL; 1615 for (j = 0; j < n; j++) { 1616 const PetscScalar aatmp = aa[j]; 1617 const PetscInt ajtmp = aj[j]; 1618 r1 += aatmp * b1[ajtmp]; 1619 r2 += aatmp * b2[ajtmp]; 1620 r3 += aatmp * b3[ajtmp]; 1621 } 1622 if (add) { 1623 c1[i] += r1; 1624 c2[i] += r2; 1625 c3[i] += r3; 1626 } else { 1627 c1[i] = r1; 1628 c2[i] = r2; 1629 c3[i] = r3; 1630 } 1631 } 1632 } 1633 } 1634 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1635 if (add) { 1636 PetscCall(MatDenseRestoreArray(C, &c)); 1637 } else { 1638 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1639 } 1640 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1641 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1642 PetscFunctionReturn(PETSC_SUCCESS); 1643 } 1644 1645 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1646 { 1647 PetscFunctionBegin; 1648 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1649 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1650 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1651 1652 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1653 PetscFunctionReturn(PETSC_SUCCESS); 1654 } 1655 1656 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1657 { 1658 PetscFunctionBegin; 1659 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1660 C->ops->productsymbolic = MatProductSymbolic_AB; 1661 PetscFunctionReturn(PETSC_SUCCESS); 1662 } 1663 1664 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1665 1666 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1667 { 1668 PetscFunctionBegin; 1669 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1670 C->ops->productsymbolic = MatProductSymbolic_AtB; 1671 PetscFunctionReturn(PETSC_SUCCESS); 1672 } 1673 1674 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1675 { 1676 PetscFunctionBegin; 1677 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1678 C->ops->productsymbolic = MatProductSymbolic_ABt; 1679 PetscFunctionReturn(PETSC_SUCCESS); 1680 } 1681 1682 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1683 { 1684 Mat_Product *product = C->product; 1685 1686 PetscFunctionBegin; 1687 switch (product->type) { 1688 case MATPRODUCT_AB: 1689 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1690 break; 1691 case MATPRODUCT_AtB: 1692 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1693 break; 1694 case MATPRODUCT_ABt: 1695 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1696 break; 1697 default: 1698 break; 1699 } 1700 PetscFunctionReturn(PETSC_SUCCESS); 1701 } 1702 1703 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1704 { 1705 Mat_Product *product = C->product; 1706 Mat A = product->A; 1707 PetscBool baij; 1708 1709 PetscFunctionBegin; 1710 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1711 if (!baij) { /* A is seqsbaij */ 1712 PetscBool sbaij; 1713 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1714 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1715 1716 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1717 } else { /* A is seqbaij */ 1718 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1719 } 1720 1721 C->ops->productsymbolic = MatProductSymbolic_AB; 1722 PetscFunctionReturn(PETSC_SUCCESS); 1723 } 1724 1725 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1726 { 1727 Mat_Product *product = C->product; 1728 1729 PetscFunctionBegin; 1730 MatCheckProduct(C, 1); 1731 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1732 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1733 PetscFunctionReturn(PETSC_SUCCESS); 1734 } 1735 1736 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1737 { 1738 PetscFunctionBegin; 1739 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1740 C->ops->productsymbolic = MatProductSymbolic_AB; 1741 PetscFunctionReturn(PETSC_SUCCESS); 1742 } 1743 1744 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1745 { 1746 Mat_Product *product = C->product; 1747 1748 PetscFunctionBegin; 1749 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1750 PetscFunctionReturn(PETSC_SUCCESS); 1751 } 1752 1753 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1754 { 1755 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1756 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1757 PetscInt *bi = b->i, *bj = b->j; 1758 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1759 MatScalar *btval, *btval_den, *ba = b->a; 1760 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1761 1762 PetscFunctionBegin; 1763 btval_den = btdense->v; 1764 PetscCall(PetscArrayzero(btval_den, m * n)); 1765 for (k = 0; k < ncolors; k++) { 1766 ncolumns = coloring->ncolumns[k]; 1767 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1768 col = *(columns + colorforcol[k] + l); 1769 btcol = bj + bi[col]; 1770 btval = ba + bi[col]; 1771 anz = bi[col + 1] - bi[col]; 1772 for (j = 0; j < anz; j++) { 1773 brow = btcol[j]; 1774 btval_den[brow] = btval[j]; 1775 } 1776 } 1777 btval_den += m; 1778 } 1779 PetscFunctionReturn(PETSC_SUCCESS); 1780 } 1781 1782 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1783 { 1784 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1785 const PetscScalar *ca_den, *ca_den_ptr; 1786 PetscScalar *ca = csp->a; 1787 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1788 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1789 PetscInt nrows, *row, *idx; 1790 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1791 1792 PetscFunctionBegin; 1793 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1794 1795 if (brows > 0) { 1796 PetscInt *lstart, row_end, row_start; 1797 lstart = matcoloring->lstart; 1798 PetscCall(PetscArrayzero(lstart, ncolors)); 1799 1800 row_end = brows; 1801 if (row_end > m) row_end = m; 1802 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1803 ca_den_ptr = ca_den; 1804 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1805 nrows = matcoloring->nrows[k]; 1806 row = rows + colorforrow[k]; 1807 idx = den2sp + colorforrow[k]; 1808 for (l = lstart[k]; l < nrows; l++) { 1809 if (row[l] >= row_end) { 1810 lstart[k] = l; 1811 break; 1812 } else { 1813 ca[idx[l]] = ca_den_ptr[row[l]]; 1814 } 1815 } 1816 ca_den_ptr += m; 1817 } 1818 row_end += brows; 1819 if (row_end > m) row_end = m; 1820 } 1821 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1822 ca_den_ptr = ca_den; 1823 for (k = 0; k < ncolors; k++) { 1824 nrows = matcoloring->nrows[k]; 1825 row = rows + colorforrow[k]; 1826 idx = den2sp + colorforrow[k]; 1827 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1828 ca_den_ptr += m; 1829 } 1830 } 1831 1832 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1833 #if defined(PETSC_USE_INFO) 1834 if (matcoloring->brows > 0) { 1835 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1836 } else { 1837 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1838 } 1839 #endif 1840 PetscFunctionReturn(PETSC_SUCCESS); 1841 } 1842 1843 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1844 { 1845 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1846 const PetscInt *is, *ci, *cj, *row_idx; 1847 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1848 IS *isa; 1849 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1850 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1851 PetscInt *colorforcol, *columns, *columns_i, brows; 1852 PetscBool flg; 1853 1854 PetscFunctionBegin; 1855 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1856 1857 /* bs >1 is not being tested yet! */ 1858 Nbs = mat->cmap->N / bs; 1859 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1860 c->N = Nbs; 1861 c->m = c->M; 1862 c->rstart = 0; 1863 c->brows = 100; 1864 1865 c->ncolors = nis; 1866 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1867 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1868 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1869 1870 brows = c->brows; 1871 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1872 if (flg) c->brows = brows; 1873 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1874 1875 colorforrow[0] = 0; 1876 rows_i = rows; 1877 den2sp_i = den2sp; 1878 1879 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1880 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1881 1882 colorforcol[0] = 0; 1883 columns_i = columns; 1884 1885 /* get column-wise storage of mat */ 1886 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1887 1888 cm = c->m; 1889 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1890 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1891 for (i = 0; i < nis; i++) { /* loop over color */ 1892 PetscCall(ISGetLocalSize(isa[i], &n)); 1893 PetscCall(ISGetIndices(isa[i], &is)); 1894 1895 c->ncolumns[i] = n; 1896 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1897 colorforcol[i + 1] = colorforcol[i] + n; 1898 columns_i += n; 1899 1900 /* fast, crude version requires O(N*N) work */ 1901 PetscCall(PetscArrayzero(rowhit, cm)); 1902 1903 for (j = 0; j < n; j++) { /* loop over columns*/ 1904 col = is[j]; 1905 row_idx = cj + ci[col]; 1906 m = ci[col + 1] - ci[col]; 1907 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1908 idxhit[*row_idx] = spidx[ci[col] + k]; 1909 rowhit[*row_idx++] = col + 1; 1910 } 1911 } 1912 /* count the number of hits */ 1913 nrows = 0; 1914 for (j = 0; j < cm; j++) { 1915 if (rowhit[j]) nrows++; 1916 } 1917 c->nrows[i] = nrows; 1918 colorforrow[i + 1] = colorforrow[i] + nrows; 1919 1920 nrows = 0; 1921 for (j = 0; j < cm; j++) { /* loop over rows */ 1922 if (rowhit[j]) { 1923 rows_i[nrows] = j; 1924 den2sp_i[nrows] = idxhit[j]; 1925 nrows++; 1926 } 1927 } 1928 den2sp_i += nrows; 1929 1930 PetscCall(ISRestoreIndices(isa[i], &is)); 1931 rows_i += nrows; 1932 } 1933 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1934 PetscCall(PetscFree(rowhit)); 1935 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1936 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1937 1938 c->colorforrow = colorforrow; 1939 c->rows = rows; 1940 c->den2sp = den2sp; 1941 c->colorforcol = colorforcol; 1942 c->columns = columns; 1943 1944 PetscCall(PetscFree(idxhit)); 1945 PetscFunctionReturn(PETSC_SUCCESS); 1946 } 1947 1948 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1949 { 1950 Mat_Product *product = C->product; 1951 Mat A = product->A, B = product->B; 1952 1953 PetscFunctionBegin; 1954 if (C->ops->mattransposemultnumeric) { 1955 /* Alg: "outerproduct" */ 1956 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1957 } else { 1958 /* Alg: "matmatmult" -- C = At*B */ 1959 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1960 1961 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1962 if (atb->At) { 1963 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1964 user may have called MatProductReplaceMats() to get this A=product->A */ 1965 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1966 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1967 } 1968 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1969 } 1970 PetscFunctionReturn(PETSC_SUCCESS); 1971 } 1972 1973 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1974 { 1975 Mat_Product *product = C->product; 1976 Mat A = product->A, B = product->B; 1977 PetscReal fill = product->fill; 1978 1979 PetscFunctionBegin; 1980 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1981 1982 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1983 PetscFunctionReturn(PETSC_SUCCESS); 1984 } 1985 1986 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1987 { 1988 Mat_Product *product = C->product; 1989 PetscInt alg = 0; /* default algorithm */ 1990 PetscBool flg = PETSC_FALSE; 1991 #if !defined(PETSC_HAVE_HYPRE) 1992 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1993 PetscInt nalg = 7; 1994 #else 1995 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1996 PetscInt nalg = 8; 1997 #endif 1998 1999 PetscFunctionBegin; 2000 /* Set default algorithm */ 2001 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2002 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2003 2004 /* Get runtime option */ 2005 if (product->api_user) { 2006 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2007 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2008 PetscOptionsEnd(); 2009 } else { 2010 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2011 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2012 PetscOptionsEnd(); 2013 } 2014 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2015 2016 C->ops->productsymbolic = MatProductSymbolic_AB; 2017 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2018 PetscFunctionReturn(PETSC_SUCCESS); 2019 } 2020 2021 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2022 { 2023 Mat_Product *product = C->product; 2024 PetscInt alg = 0; /* default algorithm */ 2025 PetscBool flg = PETSC_FALSE; 2026 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2027 PetscInt nalg = 3; 2028 2029 PetscFunctionBegin; 2030 /* Get runtime option */ 2031 if (product->api_user) { 2032 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2033 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2034 PetscOptionsEnd(); 2035 } else { 2036 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2037 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2038 PetscOptionsEnd(); 2039 } 2040 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2041 2042 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2043 PetscFunctionReturn(PETSC_SUCCESS); 2044 } 2045 2046 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2047 { 2048 Mat_Product *product = C->product; 2049 PetscInt alg = 0; /* default algorithm */ 2050 PetscBool flg = PETSC_FALSE; 2051 const char *algTypes[2] = {"default", "color"}; 2052 PetscInt nalg = 2; 2053 2054 PetscFunctionBegin; 2055 /* Set default algorithm */ 2056 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2057 if (!flg) { 2058 alg = 1; 2059 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2060 } 2061 2062 /* Get runtime option */ 2063 if (product->api_user) { 2064 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2065 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2066 PetscOptionsEnd(); 2067 } else { 2068 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2069 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2070 PetscOptionsEnd(); 2071 } 2072 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2073 2074 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2075 C->ops->productsymbolic = MatProductSymbolic_ABt; 2076 PetscFunctionReturn(PETSC_SUCCESS); 2077 } 2078 2079 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2080 { 2081 Mat_Product *product = C->product; 2082 PetscBool flg = PETSC_FALSE; 2083 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2084 #if !defined(PETSC_HAVE_HYPRE) 2085 const char *algTypes[2] = {"scalable", "rap"}; 2086 PetscInt nalg = 2; 2087 #else 2088 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2089 PetscInt nalg = 3; 2090 #endif 2091 2092 PetscFunctionBegin; 2093 /* Set default algorithm */ 2094 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2095 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2096 2097 /* Get runtime option */ 2098 if (product->api_user) { 2099 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2100 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2101 PetscOptionsEnd(); 2102 } else { 2103 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2104 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2105 PetscOptionsEnd(); 2106 } 2107 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2108 2109 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2110 PetscFunctionReturn(PETSC_SUCCESS); 2111 } 2112 2113 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2114 { 2115 Mat_Product *product = C->product; 2116 PetscBool flg = PETSC_FALSE; 2117 PetscInt alg = 0; /* default algorithm */ 2118 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2119 PetscInt nalg = 3; 2120 2121 PetscFunctionBegin; 2122 /* Set default algorithm */ 2123 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2124 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2125 2126 /* Get runtime option */ 2127 if (product->api_user) { 2128 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2129 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2130 PetscOptionsEnd(); 2131 } else { 2132 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2133 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2134 PetscOptionsEnd(); 2135 } 2136 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2137 2138 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2139 PetscFunctionReturn(PETSC_SUCCESS); 2140 } 2141 2142 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2143 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2144 { 2145 Mat_Product *product = C->product; 2146 PetscInt alg = 0; /* default algorithm */ 2147 PetscBool flg = PETSC_FALSE; 2148 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2149 PetscInt nalg = 7; 2150 2151 PetscFunctionBegin; 2152 /* Set default algorithm */ 2153 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2154 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2155 2156 /* Get runtime option */ 2157 if (product->api_user) { 2158 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2159 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2160 PetscOptionsEnd(); 2161 } else { 2162 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2163 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2164 PetscOptionsEnd(); 2165 } 2166 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2167 2168 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2169 C->ops->productsymbolic = MatProductSymbolic_ABC; 2170 PetscFunctionReturn(PETSC_SUCCESS); 2171 } 2172 2173 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2174 { 2175 Mat_Product *product = C->product; 2176 2177 PetscFunctionBegin; 2178 switch (product->type) { 2179 case MATPRODUCT_AB: 2180 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2181 break; 2182 case MATPRODUCT_AtB: 2183 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2184 break; 2185 case MATPRODUCT_ABt: 2186 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2187 break; 2188 case MATPRODUCT_PtAP: 2189 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2190 break; 2191 case MATPRODUCT_RARt: 2192 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2193 break; 2194 case MATPRODUCT_ABC: 2195 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2196 break; 2197 default: 2198 break; 2199 } 2200 PetscFunctionReturn(PETSC_SUCCESS); 2201 } 2202