1 /* 2 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 3 C = A * B 4 */ 5 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <petscbt.h> 9 #include <petsc/private/isimpl.h> 10 #include <../src/mat/impls/dense/seq/dense.h> 11 12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 13 { 14 PetscFunctionBegin; 15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 17 PetscFunctionReturn(PETSC_SUCCESS); 18 } 19 20 /* Modified from MatCreateSeqAIJWithArrays() */ 21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 22 { 23 PetscInt ii; 24 Mat_SeqAIJ *aij; 25 PetscBool isseqaij, osingle, ofree_a, ofree_ij; 26 27 PetscFunctionBegin; 28 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 29 PetscCall(MatSetSizes(mat, m, n, m, n)); 30 31 if (!mtype) { 32 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 33 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 34 } else { 35 PetscCall(MatSetType(mat, mtype)); 36 } 37 38 aij = (Mat_SeqAIJ *)(mat)->data; 39 osingle = aij->singlemalloc; 40 ofree_a = aij->free_a; 41 ofree_ij = aij->free_ij; 42 /* changes the free flags */ 43 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 44 45 PetscCall(PetscFree(aij->ilen)); 46 PetscCall(PetscFree(aij->imax)); 47 PetscCall(PetscMalloc1(m, &aij->imax)); 48 PetscCall(PetscMalloc1(m, &aij->ilen)); 49 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 50 const PetscInt rnz = i[ii + 1] - i[ii]; 51 aij->nonzerorowcnt += !!rnz; 52 aij->rmax = PetscMax(aij->rmax, rnz); 53 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 54 } 55 aij->maxnz = i[m]; 56 aij->nz = i[m]; 57 58 if (osingle) { 59 PetscCall(PetscFree3(aij->a, aij->j, aij->i)); 60 } else { 61 if (ofree_a) PetscCall(PetscFree(aij->a)); 62 if (ofree_ij) PetscCall(PetscFree(aij->j)); 63 if (ofree_ij) PetscCall(PetscFree(aij->i)); 64 } 65 aij->i = i; 66 aij->j = j; 67 aij->a = a; 68 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 69 /* default to not retain ownership */ 70 aij->singlemalloc = PETSC_FALSE; 71 aij->free_a = PETSC_FALSE; 72 aij->free_ij = PETSC_FALSE; 73 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 74 // Always build the diag info when i, j are set 75 PetscCall(MatMarkDiagonal_SeqAIJ(mat)); 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 80 { 81 Mat_Product *product = C->product; 82 MatProductAlgorithm alg; 83 PetscBool flg; 84 85 PetscFunctionBegin; 86 if (product) { 87 alg = product->alg; 88 } else { 89 alg = "sorted"; 90 } 91 /* sorted */ 92 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 93 if (flg) { 94 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 95 PetscFunctionReturn(PETSC_SUCCESS); 96 } 97 98 /* scalable */ 99 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 100 if (flg) { 101 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 102 PetscFunctionReturn(PETSC_SUCCESS); 103 } 104 105 /* scalable_fast */ 106 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 107 if (flg) { 108 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 109 PetscFunctionReturn(PETSC_SUCCESS); 110 } 111 112 /* heap */ 113 PetscCall(PetscStrcmp(alg, "heap", &flg)); 114 if (flg) { 115 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 116 PetscFunctionReturn(PETSC_SUCCESS); 117 } 118 119 /* btheap */ 120 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 121 if (flg) { 122 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 123 PetscFunctionReturn(PETSC_SUCCESS); 124 } 125 126 /* llcondensed */ 127 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 128 if (flg) { 129 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 130 PetscFunctionReturn(PETSC_SUCCESS); 131 } 132 133 /* rowmerge */ 134 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 135 if (flg) { 136 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 140 #if defined(PETSC_HAVE_HYPRE) 141 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 142 if (flg) { 143 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 #endif 147 148 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 149 } 150 151 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 154 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 155 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 156 PetscReal afill; 157 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 158 PetscHMapI ta; 159 PetscBT lnkbt; 160 PetscFreeSpaceList free_space = NULL, current_space = NULL; 161 162 PetscFunctionBegin; 163 /* Get ci and cj */ 164 /* Allocate ci array, arrays for fill computation and */ 165 /* free space for accumulating nonzero column info */ 166 PetscCall(PetscMalloc1(am + 2, &ci)); 167 ci[0] = 0; 168 169 /* create and initialize a linked list */ 170 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 171 MatRowMergeMax_SeqAIJ(b, bm, ta); 172 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 173 PetscCall(PetscHMapIDestroy(&ta)); 174 175 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 176 177 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 178 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 179 180 current_space = free_space; 181 182 /* Determine ci and cj */ 183 for (i = 0; i < am; i++) { 184 anzi = ai[i + 1] - ai[i]; 185 aj = a->j + ai[i]; 186 for (j = 0; j < anzi; j++) { 187 brow = aj[j]; 188 bnzj = bi[brow + 1] - bi[brow]; 189 bj = b->j + bi[brow]; 190 /* add non-zero cols of B into the sorted linked list lnk */ 191 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 192 } 193 /* add possible missing diagonal entry */ 194 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 195 cnzi = lnk[0]; 196 197 /* If free space is not available, make more free space */ 198 /* Double the amount of total space in the list */ 199 if (current_space->local_remaining < cnzi) { 200 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 201 ndouble++; 202 } 203 204 /* Copy data into free space, then initialize lnk */ 205 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 206 207 current_space->array += cnzi; 208 current_space->local_used += cnzi; 209 current_space->local_remaining -= cnzi; 210 211 ci[i + 1] = ci[i] + cnzi; 212 } 213 214 /* Column indices are in the list of free space */ 215 /* Allocate space for cj, initialize cj, and */ 216 /* destroy list of free space and other temporary array(s) */ 217 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 218 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 219 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 220 221 /* put together the new symbolic matrix */ 222 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 223 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 224 225 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 226 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 227 c = (Mat_SeqAIJ *)(C->data); 228 c->free_a = PETSC_FALSE; 229 c->free_ij = PETSC_TRUE; 230 c->nonew = 0; 231 232 /* fast, needs non-scalable O(bn) array 'abdense' */ 233 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 234 235 /* set MatInfo */ 236 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 237 if (afill < 1.0) afill = 1.0; 238 C->info.mallocs = ndouble; 239 C->info.fill_ratio_given = fill; 240 C->info.fill_ratio_needed = afill; 241 242 #if defined(PETSC_USE_INFO) 243 if (ci[am]) { 244 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 245 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 246 } else { 247 PetscCall(PetscInfo(C, "Empty matrix product\n")); 248 } 249 #endif 250 PetscFunctionReturn(PETSC_SUCCESS); 251 } 252 253 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 254 { 255 PetscLogDouble flops = 0.0; 256 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 257 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 258 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 259 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 260 PetscInt am = A->rmap->n, cm = C->rmap->n; 261 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 262 PetscScalar *ca, valtmp; 263 PetscScalar *ab_dense; 264 PetscContainer cab_dense; 265 const PetscScalar *aa, *ba, *baj; 266 267 PetscFunctionBegin; 268 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 269 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 270 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 271 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 272 c->a = ca; 273 c->free_a = PETSC_TRUE; 274 } else ca = c->a; 275 276 /* TODO this should be done in the symbolic phase */ 277 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 278 that is hard to eradicate) */ 279 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 280 if (!cab_dense) { 281 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 282 PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &cab_dense)); 283 PetscCall(PetscContainerSetPointer(cab_dense, ab_dense)); 284 PetscCall(PetscContainerSetUserDestroy(cab_dense, PetscContainerUserDestroyDefault)); 285 PetscCall(PetscObjectCompose((PetscObject)C, "__PETSc__ab_dense", (PetscObject)cab_dense)); 286 PetscCall(PetscObjectDereference((PetscObject)cab_dense)); 287 } 288 PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 289 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 290 291 /* clean old values in C */ 292 PetscCall(PetscArrayzero(ca, ci[cm])); 293 /* Traverse A row-wise. */ 294 /* Build the ith row in C by summing over nonzero columns in A, */ 295 /* the rows of B corresponding to nonzeros of A. */ 296 for (i = 0; i < am; i++) { 297 anzi = ai[i + 1] - ai[i]; 298 for (j = 0; j < anzi; j++) { 299 brow = aj[j]; 300 bnzi = bi[brow + 1] - bi[brow]; 301 bjj = bj + bi[brow]; 302 baj = ba + bi[brow]; 303 /* perform dense axpy */ 304 valtmp = aa[j]; 305 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 306 flops += 2 * bnzi; 307 } 308 aj = PetscSafePointerPlusOffset(aj, anzi); 309 aa = PetscSafePointerPlusOffset(aa, anzi); 310 311 cnzi = ci[i + 1] - ci[i]; 312 for (k = 0; k < cnzi; k++) { 313 ca[k] += ab_dense[cj[k]]; 314 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 315 } 316 flops += cnzi; 317 cj = PetscSafePointerPlusOffset(cj, cnzi); 318 ca += cnzi; 319 } 320 #if defined(PETSC_HAVE_DEVICE) 321 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 322 #endif 323 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 324 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 325 PetscCall(PetscLogFlops(flops)); 326 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 327 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 328 PetscFunctionReturn(PETSC_SUCCESS); 329 } 330 331 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 332 { 333 PetscLogDouble flops = 0.0; 334 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 335 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 336 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 337 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 338 PetscInt am = A->rmap->N, cm = C->rmap->N; 339 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 340 PetscScalar *ca = c->a, valtmp; 341 const PetscScalar *aa, *ba, *baj; 342 PetscInt nextb; 343 344 PetscFunctionBegin; 345 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 346 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 347 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 348 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 349 c->a = ca; 350 c->free_a = PETSC_TRUE; 351 } 352 353 /* clean old values in C */ 354 PetscCall(PetscArrayzero(ca, ci[cm])); 355 /* Traverse A row-wise. */ 356 /* Build the ith row in C by summing over nonzero columns in A, */ 357 /* the rows of B corresponding to nonzeros of A. */ 358 for (i = 0; i < am; i++) { 359 anzi = ai[i + 1] - ai[i]; 360 cnzi = ci[i + 1] - ci[i]; 361 for (j = 0; j < anzi; j++) { 362 brow = aj[j]; 363 bnzi = bi[brow + 1] - bi[brow]; 364 bjj = bj + bi[brow]; 365 baj = ba + bi[brow]; 366 /* perform sparse axpy */ 367 valtmp = aa[j]; 368 nextb = 0; 369 for (k = 0; nextb < bnzi; k++) { 370 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 371 ca[k] += valtmp * baj[nextb++]; 372 } 373 } 374 flops += 2 * bnzi; 375 } 376 aj += anzi; 377 aa += anzi; 378 cj += cnzi; 379 ca += cnzi; 380 } 381 #if defined(PETSC_HAVE_DEVICE) 382 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 383 #endif 384 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 385 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 386 PetscCall(PetscLogFlops(flops)); 387 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 388 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 389 PetscFunctionReturn(PETSC_SUCCESS); 390 } 391 392 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 393 { 394 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 395 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 396 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 397 MatScalar *ca; 398 PetscReal afill; 399 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 400 PetscHMapI ta; 401 PetscFreeSpaceList free_space = NULL, current_space = NULL; 402 403 PetscFunctionBegin; 404 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 405 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 406 PetscCall(PetscMalloc1(am + 2, &ci)); 407 ci[0] = 0; 408 409 /* create and initialize a linked list */ 410 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 411 MatRowMergeMax_SeqAIJ(b, bm, ta); 412 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 413 PetscCall(PetscHMapIDestroy(&ta)); 414 415 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 416 417 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 418 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 419 current_space = free_space; 420 421 /* Determine ci and cj */ 422 for (i = 0; i < am; i++) { 423 anzi = ai[i + 1] - ai[i]; 424 aj = a->j + ai[i]; 425 for (j = 0; j < anzi; j++) { 426 brow = aj[j]; 427 bnzj = bi[brow + 1] - bi[brow]; 428 bj = b->j + bi[brow]; 429 /* add non-zero cols of B into the sorted linked list lnk */ 430 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 431 } 432 /* add possible missing diagonal entry */ 433 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 434 cnzi = lnk[1]; 435 436 /* If free space is not available, make more free space */ 437 /* Double the amount of total space in the list */ 438 if (current_space->local_remaining < cnzi) { 439 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 440 ndouble++; 441 } 442 443 /* Copy data into free space, then initialize lnk */ 444 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 445 446 current_space->array += cnzi; 447 current_space->local_used += cnzi; 448 current_space->local_remaining -= cnzi; 449 450 ci[i + 1] = ci[i] + cnzi; 451 } 452 453 /* Column indices are in the list of free space */ 454 /* Allocate space for cj, initialize cj, and */ 455 /* destroy list of free space and other temporary array(s) */ 456 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 457 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 458 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 459 460 /* Allocate space for ca */ 461 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 462 463 /* put together the new symbolic matrix */ 464 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 465 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 466 467 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 468 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 469 c = (Mat_SeqAIJ *)(C->data); 470 c->free_a = PETSC_TRUE; 471 c->free_ij = PETSC_TRUE; 472 c->nonew = 0; 473 474 /* slower, less memory */ 475 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 476 477 /* set MatInfo */ 478 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 479 if (afill < 1.0) afill = 1.0; 480 C->info.mallocs = ndouble; 481 C->info.fill_ratio_given = fill; 482 C->info.fill_ratio_needed = afill; 483 484 #if defined(PETSC_USE_INFO) 485 if (ci[am]) { 486 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 487 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 488 } else { 489 PetscCall(PetscInfo(C, "Empty matrix product\n")); 490 } 491 #endif 492 PetscFunctionReturn(PETSC_SUCCESS); 493 } 494 495 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 496 { 497 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 498 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 499 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 500 MatScalar *ca; 501 PetscReal afill; 502 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 503 PetscHMapI ta; 504 PetscFreeSpaceList free_space = NULL, current_space = NULL; 505 506 PetscFunctionBegin; 507 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 508 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 509 PetscCall(PetscMalloc1(am + 2, &ci)); 510 ci[0] = 0; 511 512 /* create and initialize a linked list */ 513 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 514 MatRowMergeMax_SeqAIJ(b, bm, ta); 515 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 516 PetscCall(PetscHMapIDestroy(&ta)); 517 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 518 519 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 520 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 521 current_space = free_space; 522 523 /* Determine ci and cj */ 524 for (i = 0; i < am; i++) { 525 anzi = ai[i + 1] - ai[i]; 526 aj = a->j + ai[i]; 527 for (j = 0; j < anzi; j++) { 528 brow = aj[j]; 529 bnzj = bi[brow + 1] - bi[brow]; 530 bj = b->j + bi[brow]; 531 /* add non-zero cols of B into the sorted linked list lnk */ 532 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 533 } 534 /* add possible missing diagonal entry */ 535 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 536 537 cnzi = lnk[0]; 538 539 /* If free space is not available, make more free space */ 540 /* Double the amount of total space in the list */ 541 if (current_space->local_remaining < cnzi) { 542 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 543 ndouble++; 544 } 545 546 /* Copy data into free space, then initialize lnk */ 547 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 548 549 current_space->array += cnzi; 550 current_space->local_used += cnzi; 551 current_space->local_remaining -= cnzi; 552 553 ci[i + 1] = ci[i] + cnzi; 554 } 555 556 /* Column indices are in the list of free space */ 557 /* Allocate space for cj, initialize cj, and */ 558 /* destroy list of free space and other temporary array(s) */ 559 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 560 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 561 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 562 563 /* Allocate space for ca */ 564 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 565 566 /* put together the new symbolic matrix */ 567 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 568 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 569 570 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 571 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 572 c = (Mat_SeqAIJ *)(C->data); 573 c->free_a = PETSC_TRUE; 574 c->free_ij = PETSC_TRUE; 575 c->nonew = 0; 576 577 /* slower, less memory */ 578 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 579 580 /* set MatInfo */ 581 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 582 if (afill < 1.0) afill = 1.0; 583 C->info.mallocs = ndouble; 584 C->info.fill_ratio_given = fill; 585 C->info.fill_ratio_needed = afill; 586 587 #if defined(PETSC_USE_INFO) 588 if (ci[am]) { 589 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 590 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 591 } else { 592 PetscCall(PetscInfo(C, "Empty matrix product\n")); 593 } 594 #endif 595 PetscFunctionReturn(PETSC_SUCCESS); 596 } 597 598 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 599 { 600 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 601 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 602 PetscInt *ci, *cj, *bb; 603 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 604 PetscReal afill; 605 PetscInt i, j, col, ndouble = 0; 606 PetscFreeSpaceList free_space = NULL, current_space = NULL; 607 PetscHeap h; 608 609 PetscFunctionBegin; 610 /* Get ci and cj - by merging sorted rows using a heap */ 611 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 612 PetscCall(PetscMalloc1(am + 2, &ci)); 613 ci[0] = 0; 614 615 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 616 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 617 current_space = free_space; 618 619 PetscCall(PetscHeapCreate(a->rmax, &h)); 620 PetscCall(PetscMalloc1(a->rmax, &bb)); 621 622 /* Determine ci and cj */ 623 for (i = 0; i < am; i++) { 624 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 625 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 626 ci[i + 1] = ci[i]; 627 /* Populate the min heap */ 628 for (j = 0; j < anzi; j++) { 629 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 630 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 631 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 632 } 633 } 634 /* Pick off the min element, adding it to free space */ 635 PetscCall(PetscHeapPop(h, &j, &col)); 636 while (j >= 0) { 637 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 638 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 639 ndouble++; 640 } 641 *(current_space->array++) = col; 642 current_space->local_used++; 643 current_space->local_remaining--; 644 ci[i + 1]++; 645 646 /* stash if anything else remains in this row of B */ 647 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 648 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 649 PetscInt j2, col2; 650 PetscCall(PetscHeapPeek(h, &j2, &col2)); 651 if (col2 != col) break; 652 PetscCall(PetscHeapPop(h, &j2, &col2)); 653 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 654 } 655 /* Put any stashed elements back into the min heap */ 656 PetscCall(PetscHeapUnstash(h)); 657 PetscCall(PetscHeapPop(h, &j, &col)); 658 } 659 } 660 PetscCall(PetscFree(bb)); 661 PetscCall(PetscHeapDestroy(&h)); 662 663 /* Column indices are in the list of free space */ 664 /* Allocate space for cj, initialize cj, and */ 665 /* destroy list of free space and other temporary array(s) */ 666 PetscCall(PetscMalloc1(ci[am], &cj)); 667 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 668 669 /* put together the new symbolic matrix */ 670 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 671 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 672 673 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 674 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 675 c = (Mat_SeqAIJ *)(C->data); 676 c->free_a = PETSC_TRUE; 677 c->free_ij = PETSC_TRUE; 678 c->nonew = 0; 679 680 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 681 682 /* set MatInfo */ 683 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 684 if (afill < 1.0) afill = 1.0; 685 C->info.mallocs = ndouble; 686 C->info.fill_ratio_given = fill; 687 C->info.fill_ratio_needed = afill; 688 689 #if defined(PETSC_USE_INFO) 690 if (ci[am]) { 691 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 692 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 693 } else { 694 PetscCall(PetscInfo(C, "Empty matrix product\n")); 695 } 696 #endif 697 PetscFunctionReturn(PETSC_SUCCESS); 698 } 699 700 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 701 { 702 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 703 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 704 PetscInt *ci, *cj, *bb; 705 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 706 PetscReal afill; 707 PetscInt i, j, col, ndouble = 0; 708 PetscFreeSpaceList free_space = NULL, current_space = NULL; 709 PetscHeap h; 710 PetscBT bt; 711 712 PetscFunctionBegin; 713 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 714 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 715 PetscCall(PetscMalloc1(am + 2, &ci)); 716 ci[0] = 0; 717 718 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 719 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 720 721 current_space = free_space; 722 723 PetscCall(PetscHeapCreate(a->rmax, &h)); 724 PetscCall(PetscMalloc1(a->rmax, &bb)); 725 PetscCall(PetscBTCreate(bn, &bt)); 726 727 /* Determine ci and cj */ 728 for (i = 0; i < am; i++) { 729 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 730 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 731 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 732 ci[i + 1] = ci[i]; 733 /* Populate the min heap */ 734 for (j = 0; j < anzi; j++) { 735 PetscInt brow = acol[j]; 736 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 737 PetscInt bcol = bj[bb[j]]; 738 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 739 PetscCall(PetscHeapAdd(h, j, bcol)); 740 bb[j]++; 741 break; 742 } 743 } 744 } 745 /* Pick off the min element, adding it to free space */ 746 PetscCall(PetscHeapPop(h, &j, &col)); 747 while (j >= 0) { 748 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 749 fptr = NULL; /* need PetscBTMemzero */ 750 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 751 ndouble++; 752 } 753 *(current_space->array++) = col; 754 current_space->local_used++; 755 current_space->local_remaining--; 756 ci[i + 1]++; 757 758 /* stash if anything else remains in this row of B */ 759 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 760 PetscInt bcol = bj[bb[j]]; 761 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 762 PetscCall(PetscHeapAdd(h, j, bcol)); 763 bb[j]++; 764 break; 765 } 766 } 767 PetscCall(PetscHeapPop(h, &j, &col)); 768 } 769 if (fptr) { /* Clear the bits for this row */ 770 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 771 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 772 PetscCall(PetscBTMemzero(bn, bt)); 773 } 774 } 775 PetscCall(PetscFree(bb)); 776 PetscCall(PetscHeapDestroy(&h)); 777 PetscCall(PetscBTDestroy(&bt)); 778 779 /* Column indices are in the list of free space */ 780 /* Allocate space for cj, initialize cj, and */ 781 /* destroy list of free space and other temporary array(s) */ 782 PetscCall(PetscMalloc1(ci[am], &cj)); 783 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 784 785 /* put together the new symbolic matrix */ 786 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 787 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 788 789 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 790 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 791 c = (Mat_SeqAIJ *)(C->data); 792 c->free_a = PETSC_TRUE; 793 c->free_ij = PETSC_TRUE; 794 c->nonew = 0; 795 796 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 797 798 /* set MatInfo */ 799 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 800 if (afill < 1.0) afill = 1.0; 801 C->info.mallocs = ndouble; 802 C->info.fill_ratio_given = fill; 803 C->info.fill_ratio_needed = afill; 804 805 #if defined(PETSC_USE_INFO) 806 if (ci[am]) { 807 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 808 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 809 } else { 810 PetscCall(PetscInfo(C, "Empty matrix product\n")); 811 } 812 #endif 813 PetscFunctionReturn(PETSC_SUCCESS); 814 } 815 816 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 817 { 818 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 819 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 820 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 821 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 822 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 823 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 824 const PetscInt *brow_ptr[8], *brow_end[8]; 825 PetscInt window[8]; 826 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 827 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 828 PetscReal afill; 829 PetscInt *workj_L1, *workj_L2, *workj_L3; 830 PetscInt L1_nnz, L2_nnz; 831 832 /* Step 1: Get upper bound on memory required for allocation. 833 Because of the way virtual memory works, 834 only the memory pages that are actually needed will be physically allocated. */ 835 PetscFunctionBegin; 836 PetscCall(PetscMalloc1(am + 1, &ci)); 837 for (i = 0; i < am; i++) { 838 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 839 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 840 a_rownnz = 0; 841 for (k = 0; k < anzi; ++k) { 842 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 843 if (a_rownnz > bn) { 844 a_rownnz = bn; 845 break; 846 } 847 } 848 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 849 } 850 /* temporary work areas for merging rows */ 851 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 852 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 853 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 854 855 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 856 c_maxmem = 8 * (ai[am] + bi[bm]); 857 /* Step 2: Populate pattern for C */ 858 PetscCall(PetscMalloc1(c_maxmem, &cj)); 859 860 ci_nnz = 0; 861 ci[0] = 0; 862 worki_L1[0] = 0; 863 worki_L2[0] = 0; 864 for (i = 0; i < am; i++) { 865 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 866 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 867 rowsleft = anzi; 868 inputcol_L1 = acol; 869 L2_nnz = 0; 870 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 871 worki_L2[1] = 0; 872 outputi_nnz = 0; 873 874 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 875 while (ci_nnz + a_maxrownnz > c_maxmem) { 876 c_maxmem *= 2; 877 ndouble++; 878 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 879 } 880 881 while (rowsleft) { 882 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 883 L1_nrows = 0; 884 L1_nnz = 0; 885 inputcol = inputcol_L1; 886 inputi = bi; 887 inputj = bj; 888 889 /* The following macro is used to specialize for small rows in A. 890 This helps with compiler unrolling, improving performance substantially. 891 Input: inputj inputi inputcol bn 892 Output: outputj outputi_nnz */ 893 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 894 do { \ 895 window_min = bn; \ 896 outputi_nnz = 0; \ 897 for (k = 0; k < ANNZ; ++k) { \ 898 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 899 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 900 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 901 window_min = PetscMin(window[k], window_min); \ 902 } \ 903 while (window_min < bn) { \ 904 outputj[outputi_nnz++] = window_min; \ 905 /* advance front and compute new minimum */ \ 906 old_window_min = window_min; \ 907 window_min = bn; \ 908 for (k = 0; k < ANNZ; ++k) { \ 909 if (window[k] == old_window_min) { \ 910 brow_ptr[k]++; \ 911 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 912 } \ 913 window_min = PetscMin(window[k], window_min); \ 914 } \ 915 } \ 916 } while (0) 917 918 /************** L E V E L 1 ***************/ 919 /* Merge up to 8 rows of B to L1 work array*/ 920 while (L1_rowsleft) { 921 outputi_nnz = 0; 922 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 923 else outputj = cj + ci_nnz; /* Merge directly to C */ 924 925 switch (L1_rowsleft) { 926 case 1: 927 brow_ptr[0] = inputj + inputi[inputcol[0]]; 928 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 929 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 930 inputcol += L1_rowsleft; 931 rowsleft -= L1_rowsleft; 932 L1_rowsleft = 0; 933 break; 934 case 2: 935 MatMatMultSymbolic_RowMergeMacro(2); 936 inputcol += L1_rowsleft; 937 rowsleft -= L1_rowsleft; 938 L1_rowsleft = 0; 939 break; 940 case 3: 941 MatMatMultSymbolic_RowMergeMacro(3); 942 inputcol += L1_rowsleft; 943 rowsleft -= L1_rowsleft; 944 L1_rowsleft = 0; 945 break; 946 case 4: 947 MatMatMultSymbolic_RowMergeMacro(4); 948 inputcol += L1_rowsleft; 949 rowsleft -= L1_rowsleft; 950 L1_rowsleft = 0; 951 break; 952 case 5: 953 MatMatMultSymbolic_RowMergeMacro(5); 954 inputcol += L1_rowsleft; 955 rowsleft -= L1_rowsleft; 956 L1_rowsleft = 0; 957 break; 958 case 6: 959 MatMatMultSymbolic_RowMergeMacro(6); 960 inputcol += L1_rowsleft; 961 rowsleft -= L1_rowsleft; 962 L1_rowsleft = 0; 963 break; 964 case 7: 965 MatMatMultSymbolic_RowMergeMacro(7); 966 inputcol += L1_rowsleft; 967 rowsleft -= L1_rowsleft; 968 L1_rowsleft = 0; 969 break; 970 default: 971 MatMatMultSymbolic_RowMergeMacro(8); 972 inputcol += 8; 973 rowsleft -= 8; 974 L1_rowsleft -= 8; 975 break; 976 } 977 inputcol_L1 = inputcol; 978 L1_nnz += outputi_nnz; 979 worki_L1[++L1_nrows] = L1_nnz; 980 } 981 982 /********************** L E V E L 2 ************************/ 983 /* Merge from L1 work array to either C or to L2 work array */ 984 if (anzi > 8) { 985 inputi = worki_L1; 986 inputj = workj_L1; 987 inputcol = workcol; 988 outputi_nnz = 0; 989 990 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 991 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 992 993 switch (L1_nrows) { 994 case 1: 995 brow_ptr[0] = inputj + inputi[inputcol[0]]; 996 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 997 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 998 break; 999 case 2: 1000 MatMatMultSymbolic_RowMergeMacro(2); 1001 break; 1002 case 3: 1003 MatMatMultSymbolic_RowMergeMacro(3); 1004 break; 1005 case 4: 1006 MatMatMultSymbolic_RowMergeMacro(4); 1007 break; 1008 case 5: 1009 MatMatMultSymbolic_RowMergeMacro(5); 1010 break; 1011 case 6: 1012 MatMatMultSymbolic_RowMergeMacro(6); 1013 break; 1014 case 7: 1015 MatMatMultSymbolic_RowMergeMacro(7); 1016 break; 1017 case 8: 1018 MatMatMultSymbolic_RowMergeMacro(8); 1019 break; 1020 default: 1021 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1022 } 1023 L2_nnz += outputi_nnz; 1024 worki_L2[++L2_nrows] = L2_nnz; 1025 1026 /************************ L E V E L 3 **********************/ 1027 /* Merge from L2 work array to either C or to L2 work array */ 1028 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1029 inputi = worki_L2; 1030 inputj = workj_L2; 1031 inputcol = workcol; 1032 outputi_nnz = 0; 1033 if (rowsleft) outputj = workj_L3; 1034 else outputj = cj + ci_nnz; 1035 switch (L2_nrows) { 1036 case 1: 1037 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1038 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1039 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1040 break; 1041 case 2: 1042 MatMatMultSymbolic_RowMergeMacro(2); 1043 break; 1044 case 3: 1045 MatMatMultSymbolic_RowMergeMacro(3); 1046 break; 1047 case 4: 1048 MatMatMultSymbolic_RowMergeMacro(4); 1049 break; 1050 case 5: 1051 MatMatMultSymbolic_RowMergeMacro(5); 1052 break; 1053 case 6: 1054 MatMatMultSymbolic_RowMergeMacro(6); 1055 break; 1056 case 7: 1057 MatMatMultSymbolic_RowMergeMacro(7); 1058 break; 1059 case 8: 1060 MatMatMultSymbolic_RowMergeMacro(8); 1061 break; 1062 default: 1063 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1064 } 1065 L2_nrows = 1; 1066 L2_nnz = outputi_nnz; 1067 worki_L2[1] = outputi_nnz; 1068 /* Copy to workj_L2 */ 1069 if (rowsleft) { 1070 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1071 } 1072 } 1073 } 1074 } /* while (rowsleft) */ 1075 #undef MatMatMultSymbolic_RowMergeMacro 1076 1077 /* terminate current row */ 1078 ci_nnz += outputi_nnz; 1079 ci[i + 1] = ci_nnz; 1080 } 1081 1082 /* Step 3: Create the new symbolic matrix */ 1083 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1084 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1085 1086 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1087 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1088 c = (Mat_SeqAIJ *)(C->data); 1089 c->free_a = PETSC_TRUE; 1090 c->free_ij = PETSC_TRUE; 1091 c->nonew = 0; 1092 1093 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1094 1095 /* set MatInfo */ 1096 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1097 if (afill < 1.0) afill = 1.0; 1098 C->info.mallocs = ndouble; 1099 C->info.fill_ratio_given = fill; 1100 C->info.fill_ratio_needed = afill; 1101 1102 #if defined(PETSC_USE_INFO) 1103 if (ci[am]) { 1104 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1105 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1106 } else { 1107 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1108 } 1109 #endif 1110 1111 /* Step 4: Free temporary work areas */ 1112 PetscCall(PetscFree(workj_L1)); 1113 PetscCall(PetscFree(workj_L2)); 1114 PetscCall(PetscFree(workj_L3)); 1115 PetscFunctionReturn(PETSC_SUCCESS); 1116 } 1117 1118 /* concatenate unique entries and then sort */ 1119 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1120 { 1121 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1122 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1123 PetscInt *ci, *cj, bcol; 1124 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1125 PetscReal afill; 1126 PetscInt i, j, ndouble = 0; 1127 PetscSegBuffer seg, segrow; 1128 char *seen; 1129 1130 PetscFunctionBegin; 1131 PetscCall(PetscMalloc1(am + 1, &ci)); 1132 ci[0] = 0; 1133 1134 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1135 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1136 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1137 PetscCall(PetscCalloc1(bn, &seen)); 1138 1139 /* Determine ci and cj */ 1140 for (i = 0; i < am; i++) { 1141 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1142 const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */ 1143 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1144 1145 /* Pack segrow */ 1146 for (j = 0; j < anzi; j++) { 1147 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1148 for (k = bjstart; k < bjend; k++) { 1149 bcol = bj[k]; 1150 if (!seen[bcol]) { /* new entry */ 1151 PetscInt *PETSC_RESTRICT slot; 1152 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1153 *slot = bcol; 1154 seen[bcol] = 1; 1155 packlen++; 1156 } 1157 } 1158 } 1159 1160 /* Check i-th diagonal entry */ 1161 if (C->force_diagonals && !seen[i]) { 1162 PetscInt *PETSC_RESTRICT slot; 1163 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1164 *slot = i; 1165 seen[i] = 1; 1166 packlen++; 1167 } 1168 1169 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1170 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1171 PetscCall(PetscSortInt(packlen, crow)); 1172 ci[i + 1] = ci[i] + packlen; 1173 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1174 } 1175 PetscCall(PetscSegBufferDestroy(&segrow)); 1176 PetscCall(PetscFree(seen)); 1177 1178 /* Column indices are in the segmented buffer */ 1179 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1180 PetscCall(PetscSegBufferDestroy(&seg)); 1181 1182 /* put together the new symbolic matrix */ 1183 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1184 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1185 1186 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1187 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1188 c = (Mat_SeqAIJ *)(C->data); 1189 c->free_a = PETSC_TRUE; 1190 c->free_ij = PETSC_TRUE; 1191 c->nonew = 0; 1192 1193 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1194 1195 /* set MatInfo */ 1196 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1197 if (afill < 1.0) afill = 1.0; 1198 C->info.mallocs = ndouble; 1199 C->info.fill_ratio_given = fill; 1200 C->info.fill_ratio_needed = afill; 1201 1202 #if defined(PETSC_USE_INFO) 1203 if (ci[am]) { 1204 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1205 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1206 } else { 1207 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1208 } 1209 #endif 1210 PetscFunctionReturn(PETSC_SUCCESS); 1211 } 1212 1213 static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1214 { 1215 Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 1216 1217 PetscFunctionBegin; 1218 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1219 PetscCall(MatDestroy(&abt->Bt_den)); 1220 PetscCall(MatDestroy(&abt->ABt_den)); 1221 PetscCall(PetscFree(abt)); 1222 PetscFunctionReturn(PETSC_SUCCESS); 1223 } 1224 1225 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1226 { 1227 Mat Bt; 1228 Mat_MatMatTransMult *abt; 1229 Mat_Product *product = C->product; 1230 char *alg; 1231 1232 PetscFunctionBegin; 1233 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1234 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1235 1236 /* create symbolic Bt */ 1237 PetscCall(MatTransposeSymbolic(B, &Bt)); 1238 PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1239 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1240 1241 /* get symbolic C=A*Bt */ 1242 PetscCall(PetscStrallocpy(product->alg, &alg)); 1243 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1244 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1245 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1246 PetscCall(PetscFree(alg)); 1247 1248 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1249 PetscCall(PetscNew(&abt)); 1250 1251 product->data = abt; 1252 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1253 1254 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1255 1256 abt->usecoloring = PETSC_FALSE; 1257 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1258 if (abt->usecoloring) { 1259 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1260 MatTransposeColoring matcoloring; 1261 MatColoring coloring; 1262 ISColoring iscoloring; 1263 Mat Bt_dense, C_dense; 1264 1265 /* inode causes memory problem */ 1266 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1267 1268 PetscCall(MatColoringCreate(C, &coloring)); 1269 PetscCall(MatColoringSetDistance(coloring, 2)); 1270 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1271 PetscCall(MatColoringSetFromOptions(coloring)); 1272 PetscCall(MatColoringApply(coloring, &iscoloring)); 1273 PetscCall(MatColoringDestroy(&coloring)); 1274 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1275 1276 abt->matcoloring = matcoloring; 1277 1278 PetscCall(ISColoringDestroy(&iscoloring)); 1279 1280 /* Create Bt_dense and C_dense = A*Bt_dense */ 1281 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1282 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1283 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1284 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1285 1286 Bt_dense->assembled = PETSC_TRUE; 1287 abt->Bt_den = Bt_dense; 1288 1289 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1290 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1291 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1292 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1293 1294 Bt_dense->assembled = PETSC_TRUE; 1295 abt->ABt_den = C_dense; 1296 1297 #if defined(PETSC_USE_INFO) 1298 { 1299 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1300 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1301 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)(c->nz)) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1302 } 1303 #endif 1304 } 1305 /* clean up */ 1306 PetscCall(MatDestroy(&Bt)); 1307 PetscFunctionReturn(PETSC_SUCCESS); 1308 } 1309 1310 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1311 { 1312 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1313 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1314 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1315 PetscLogDouble flops = 0.0; 1316 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1317 Mat_MatMatTransMult *abt; 1318 Mat_Product *product = C->product; 1319 1320 PetscFunctionBegin; 1321 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1322 abt = (Mat_MatMatTransMult *)product->data; 1323 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1324 /* clear old values in C */ 1325 if (!c->a) { 1326 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1327 c->a = ca; 1328 c->free_a = PETSC_TRUE; 1329 } else { 1330 ca = c->a; 1331 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1332 } 1333 1334 if (abt->usecoloring) { 1335 MatTransposeColoring matcoloring = abt->matcoloring; 1336 Mat Bt_dense, C_dense = abt->ABt_den; 1337 1338 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1339 Bt_dense = abt->Bt_den; 1340 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1341 1342 /* C_dense = A*Bt_dense */ 1343 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1344 1345 /* Recover C from C_dense */ 1346 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1347 PetscFunctionReturn(PETSC_SUCCESS); 1348 } 1349 1350 for (i = 0; i < cm; i++) { 1351 anzi = ai[i + 1] - ai[i]; 1352 acol = PetscSafePointerPlusOffset(aj, ai[i]); 1353 aval = PetscSafePointerPlusOffset(aa, ai[i]); 1354 cnzi = ci[i + 1] - ci[i]; 1355 ccol = PetscSafePointerPlusOffset(cj, ci[i]); 1356 cval = ca + ci[i]; 1357 for (j = 0; j < cnzi; j++) { 1358 brow = ccol[j]; 1359 bnzj = bi[brow + 1] - bi[brow]; 1360 bcol = bj + bi[brow]; 1361 bval = ba + bi[brow]; 1362 1363 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1364 nexta = 0; 1365 nextb = 0; 1366 while (nexta < anzi && nextb < bnzj) { 1367 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1368 if (nexta == anzi) break; 1369 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1370 if (nextb == bnzj) break; 1371 if (acol[nexta] == bcol[nextb]) { 1372 cval[j] += aval[nexta] * bval[nextb]; 1373 nexta++; 1374 nextb++; 1375 flops += 2; 1376 } 1377 } 1378 } 1379 } 1380 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1381 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1382 PetscCall(PetscLogFlops(flops)); 1383 PetscFunctionReturn(PETSC_SUCCESS); 1384 } 1385 1386 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1387 { 1388 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 1389 1390 PetscFunctionBegin; 1391 PetscCall(MatDestroy(&atb->At)); 1392 if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 1393 PetscCall(PetscFree(atb)); 1394 PetscFunctionReturn(PETSC_SUCCESS); 1395 } 1396 1397 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1398 { 1399 Mat At = NULL; 1400 Mat_Product *product = C->product; 1401 PetscBool flg, def, square; 1402 1403 PetscFunctionBegin; 1404 MatCheckProduct(C, 4); 1405 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1406 /* outerproduct */ 1407 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1408 if (flg) { 1409 /* create symbolic At */ 1410 if (!square) { 1411 PetscCall(MatTransposeSymbolic(A, &At)); 1412 PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1413 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1414 } 1415 /* get symbolic C=At*B */ 1416 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1417 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1418 1419 /* clean up */ 1420 if (!square) PetscCall(MatDestroy(&At)); 1421 1422 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1423 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1424 PetscFunctionReturn(PETSC_SUCCESS); 1425 } 1426 1427 /* matmatmult */ 1428 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1429 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1430 if (flg || def) { 1431 Mat_MatTransMatMult *atb; 1432 1433 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1434 PetscCall(PetscNew(&atb)); 1435 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1436 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1437 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1438 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1439 product->data = atb; 1440 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1441 atb->At = At; 1442 1443 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1444 PetscFunctionReturn(PETSC_SUCCESS); 1445 } 1446 1447 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1448 } 1449 1450 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1451 { 1452 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1453 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1454 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1455 PetscLogDouble flops = 0.0; 1456 MatScalar *aa = a->a, *ba, *ca, *caj; 1457 1458 PetscFunctionBegin; 1459 if (!c->a) { 1460 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1461 1462 c->a = ca; 1463 c->free_a = PETSC_TRUE; 1464 } else { 1465 ca = c->a; 1466 PetscCall(PetscArrayzero(ca, ci[cm])); 1467 } 1468 1469 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1470 for (i = 0; i < am; i++) { 1471 bj = b->j + bi[i]; 1472 ba = b->a + bi[i]; 1473 bnzi = bi[i + 1] - bi[i]; 1474 anzi = ai[i + 1] - ai[i]; 1475 for (j = 0; j < anzi; j++) { 1476 nextb = 0; 1477 crow = *aj++; 1478 cjj = cj + ci[crow]; 1479 caj = ca + ci[crow]; 1480 /* perform sparse axpy operation. Note cjj includes bj. */ 1481 for (k = 0; nextb < bnzi; k++) { 1482 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1483 caj[k] += (*aa) * (*(ba + nextb)); 1484 nextb++; 1485 } 1486 } 1487 flops += 2 * bnzi; 1488 aa++; 1489 } 1490 } 1491 1492 /* Assemble the final matrix and clean up */ 1493 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1494 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1495 PetscCall(PetscLogFlops(flops)); 1496 PetscFunctionReturn(PETSC_SUCCESS); 1497 } 1498 1499 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1500 { 1501 PetscFunctionBegin; 1502 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1503 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1504 PetscFunctionReturn(PETSC_SUCCESS); 1505 } 1506 1507 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1508 { 1509 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1510 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1511 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1512 const PetscInt *aj; 1513 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1514 PetscInt clda; 1515 PetscInt am4, bm4, col, i, j, n; 1516 1517 PetscFunctionBegin; 1518 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1519 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1520 if (add) { 1521 PetscCall(MatDenseGetArray(C, &c)); 1522 } else { 1523 PetscCall(MatDenseGetArrayWrite(C, &c)); 1524 } 1525 PetscCall(MatDenseGetArrayRead(B, &b)); 1526 PetscCall(MatDenseGetLDA(B, &bm)); 1527 PetscCall(MatDenseGetLDA(C, &clda)); 1528 am4 = 4 * clda; 1529 bm4 = 4 * bm; 1530 if (b) { 1531 b1 = b; 1532 b2 = b1 + bm; 1533 b3 = b2 + bm; 1534 b4 = b3 + bm; 1535 } else b1 = b2 = b3 = b4 = NULL; 1536 c1 = c; 1537 c2 = c1 + clda; 1538 c3 = c2 + clda; 1539 c4 = c3 + clda; 1540 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1541 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1542 r1 = r2 = r3 = r4 = 0.0; 1543 n = a->i[i + 1] - a->i[i]; 1544 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1545 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1546 for (j = 0; j < n; j++) { 1547 const PetscScalar aatmp = aa[j]; 1548 const PetscInt ajtmp = aj[j]; 1549 r1 += aatmp * b1[ajtmp]; 1550 r2 += aatmp * b2[ajtmp]; 1551 r3 += aatmp * b3[ajtmp]; 1552 r4 += aatmp * b4[ajtmp]; 1553 } 1554 if (add) { 1555 c1[i] += r1; 1556 c2[i] += r2; 1557 c3[i] += r3; 1558 c4[i] += r4; 1559 } else { 1560 c1[i] = r1; 1561 c2[i] = r2; 1562 c3[i] = r3; 1563 c4[i] = r4; 1564 } 1565 } 1566 if (b) { 1567 b1 += bm4; 1568 b2 += bm4; 1569 b3 += bm4; 1570 b4 += bm4; 1571 } 1572 c1 += am4; 1573 c2 += am4; 1574 c3 += am4; 1575 c4 += am4; 1576 } 1577 /* process remaining columns */ 1578 if (col != cn) { 1579 PetscInt rc = cn - col; 1580 1581 if (rc == 1) { 1582 for (i = 0; i < am; i++) { 1583 r1 = 0.0; 1584 n = a->i[i + 1] - a->i[i]; 1585 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1586 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1587 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1588 if (add) c1[i] += r1; 1589 else c1[i] = r1; 1590 } 1591 } else if (rc == 2) { 1592 for (i = 0; i < am; i++) { 1593 r1 = r2 = 0.0; 1594 n = a->i[i + 1] - a->i[i]; 1595 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1596 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1597 for (j = 0; j < n; j++) { 1598 const PetscScalar aatmp = aa[j]; 1599 const PetscInt ajtmp = aj[j]; 1600 r1 += aatmp * b1[ajtmp]; 1601 r2 += aatmp * b2[ajtmp]; 1602 } 1603 if (add) { 1604 c1[i] += r1; 1605 c2[i] += r2; 1606 } else { 1607 c1[i] = r1; 1608 c2[i] = r2; 1609 } 1610 } 1611 } else { 1612 for (i = 0; i < am; i++) { 1613 r1 = r2 = r3 = 0.0; 1614 n = a->i[i + 1] - a->i[i]; 1615 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1616 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1617 for (j = 0; j < n; j++) { 1618 const PetscScalar aatmp = aa[j]; 1619 const PetscInt ajtmp = aj[j]; 1620 r1 += aatmp * b1[ajtmp]; 1621 r2 += aatmp * b2[ajtmp]; 1622 r3 += aatmp * b3[ajtmp]; 1623 } 1624 if (add) { 1625 c1[i] += r1; 1626 c2[i] += r2; 1627 c3[i] += r3; 1628 } else { 1629 c1[i] = r1; 1630 c2[i] = r2; 1631 c3[i] = r3; 1632 } 1633 } 1634 } 1635 } 1636 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1637 if (add) { 1638 PetscCall(MatDenseRestoreArray(C, &c)); 1639 } else { 1640 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1641 } 1642 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1643 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1644 PetscFunctionReturn(PETSC_SUCCESS); 1645 } 1646 1647 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1648 { 1649 PetscFunctionBegin; 1650 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1651 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1652 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1653 1654 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1655 PetscFunctionReturn(PETSC_SUCCESS); 1656 } 1657 1658 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1659 { 1660 PetscFunctionBegin; 1661 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1662 C->ops->productsymbolic = MatProductSymbolic_AB; 1663 PetscFunctionReturn(PETSC_SUCCESS); 1664 } 1665 1666 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1667 1668 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1669 { 1670 PetscFunctionBegin; 1671 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1672 C->ops->productsymbolic = MatProductSymbolic_AtB; 1673 PetscFunctionReturn(PETSC_SUCCESS); 1674 } 1675 1676 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1677 { 1678 PetscFunctionBegin; 1679 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1680 C->ops->productsymbolic = MatProductSymbolic_ABt; 1681 PetscFunctionReturn(PETSC_SUCCESS); 1682 } 1683 1684 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1685 { 1686 Mat_Product *product = C->product; 1687 1688 PetscFunctionBegin; 1689 switch (product->type) { 1690 case MATPRODUCT_AB: 1691 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1692 break; 1693 case MATPRODUCT_AtB: 1694 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1695 break; 1696 case MATPRODUCT_ABt: 1697 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1698 break; 1699 default: 1700 break; 1701 } 1702 PetscFunctionReturn(PETSC_SUCCESS); 1703 } 1704 1705 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1706 { 1707 Mat_Product *product = C->product; 1708 Mat A = product->A; 1709 PetscBool baij; 1710 1711 PetscFunctionBegin; 1712 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1713 if (!baij) { /* A is seqsbaij */ 1714 PetscBool sbaij; 1715 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1716 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1717 1718 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1719 } else { /* A is seqbaij */ 1720 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1721 } 1722 1723 C->ops->productsymbolic = MatProductSymbolic_AB; 1724 PetscFunctionReturn(PETSC_SUCCESS); 1725 } 1726 1727 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1728 { 1729 Mat_Product *product = C->product; 1730 1731 PetscFunctionBegin; 1732 MatCheckProduct(C, 1); 1733 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1734 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1735 PetscFunctionReturn(PETSC_SUCCESS); 1736 } 1737 1738 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1739 { 1740 PetscFunctionBegin; 1741 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1742 C->ops->productsymbolic = MatProductSymbolic_AB; 1743 PetscFunctionReturn(PETSC_SUCCESS); 1744 } 1745 1746 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1747 { 1748 Mat_Product *product = C->product; 1749 1750 PetscFunctionBegin; 1751 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1752 PetscFunctionReturn(PETSC_SUCCESS); 1753 } 1754 1755 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1756 { 1757 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1758 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1759 PetscInt *bi = b->i, *bj = b->j; 1760 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1761 MatScalar *btval, *btval_den, *ba = b->a; 1762 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1763 1764 PetscFunctionBegin; 1765 btval_den = btdense->v; 1766 PetscCall(PetscArrayzero(btval_den, m * n)); 1767 for (k = 0; k < ncolors; k++) { 1768 ncolumns = coloring->ncolumns[k]; 1769 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1770 col = *(columns + colorforcol[k] + l); 1771 btcol = bj + bi[col]; 1772 btval = ba + bi[col]; 1773 anz = bi[col + 1] - bi[col]; 1774 for (j = 0; j < anz; j++) { 1775 brow = btcol[j]; 1776 btval_den[brow] = btval[j]; 1777 } 1778 } 1779 btval_den += m; 1780 } 1781 PetscFunctionReturn(PETSC_SUCCESS); 1782 } 1783 1784 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1785 { 1786 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1787 const PetscScalar *ca_den, *ca_den_ptr; 1788 PetscScalar *ca = csp->a; 1789 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1790 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1791 PetscInt nrows, *row, *idx; 1792 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1793 1794 PetscFunctionBegin; 1795 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1796 1797 if (brows > 0) { 1798 PetscInt *lstart, row_end, row_start; 1799 lstart = matcoloring->lstart; 1800 PetscCall(PetscArrayzero(lstart, ncolors)); 1801 1802 row_end = brows; 1803 if (row_end > m) row_end = m; 1804 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1805 ca_den_ptr = ca_den; 1806 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1807 nrows = matcoloring->nrows[k]; 1808 row = rows + colorforrow[k]; 1809 idx = den2sp + colorforrow[k]; 1810 for (l = lstart[k]; l < nrows; l++) { 1811 if (row[l] >= row_end) { 1812 lstart[k] = l; 1813 break; 1814 } else { 1815 ca[idx[l]] = ca_den_ptr[row[l]]; 1816 } 1817 } 1818 ca_den_ptr += m; 1819 } 1820 row_end += brows; 1821 if (row_end > m) row_end = m; 1822 } 1823 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1824 ca_den_ptr = ca_den; 1825 for (k = 0; k < ncolors; k++) { 1826 nrows = matcoloring->nrows[k]; 1827 row = rows + colorforrow[k]; 1828 idx = den2sp + colorforrow[k]; 1829 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1830 ca_den_ptr += m; 1831 } 1832 } 1833 1834 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1835 #if defined(PETSC_USE_INFO) 1836 if (matcoloring->brows > 0) { 1837 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1838 } else { 1839 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1840 } 1841 #endif 1842 PetscFunctionReturn(PETSC_SUCCESS); 1843 } 1844 1845 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1846 { 1847 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1848 const PetscInt *is, *ci, *cj, *row_idx; 1849 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1850 IS *isa; 1851 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1852 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1853 PetscInt *colorforcol, *columns, *columns_i, brows; 1854 PetscBool flg; 1855 1856 PetscFunctionBegin; 1857 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1858 1859 /* bs >1 is not being tested yet! */ 1860 Nbs = mat->cmap->N / bs; 1861 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1862 c->N = Nbs; 1863 c->m = c->M; 1864 c->rstart = 0; 1865 c->brows = 100; 1866 1867 c->ncolors = nis; 1868 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1869 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1870 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1871 1872 brows = c->brows; 1873 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1874 if (flg) c->brows = brows; 1875 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1876 1877 colorforrow[0] = 0; 1878 rows_i = rows; 1879 den2sp_i = den2sp; 1880 1881 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1882 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1883 1884 colorforcol[0] = 0; 1885 columns_i = columns; 1886 1887 /* get column-wise storage of mat */ 1888 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1889 1890 cm = c->m; 1891 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1892 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1893 for (i = 0; i < nis; i++) { /* loop over color */ 1894 PetscCall(ISGetLocalSize(isa[i], &n)); 1895 PetscCall(ISGetIndices(isa[i], &is)); 1896 1897 c->ncolumns[i] = n; 1898 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1899 colorforcol[i + 1] = colorforcol[i] + n; 1900 columns_i += n; 1901 1902 /* fast, crude version requires O(N*N) work */ 1903 PetscCall(PetscArrayzero(rowhit, cm)); 1904 1905 for (j = 0; j < n; j++) { /* loop over columns*/ 1906 col = is[j]; 1907 row_idx = cj + ci[col]; 1908 m = ci[col + 1] - ci[col]; 1909 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1910 idxhit[*row_idx] = spidx[ci[col] + k]; 1911 rowhit[*row_idx++] = col + 1; 1912 } 1913 } 1914 /* count the number of hits */ 1915 nrows = 0; 1916 for (j = 0; j < cm; j++) { 1917 if (rowhit[j]) nrows++; 1918 } 1919 c->nrows[i] = nrows; 1920 colorforrow[i + 1] = colorforrow[i] + nrows; 1921 1922 nrows = 0; 1923 for (j = 0; j < cm; j++) { /* loop over rows */ 1924 if (rowhit[j]) { 1925 rows_i[nrows] = j; 1926 den2sp_i[nrows] = idxhit[j]; 1927 nrows++; 1928 } 1929 } 1930 den2sp_i += nrows; 1931 1932 PetscCall(ISRestoreIndices(isa[i], &is)); 1933 rows_i += nrows; 1934 } 1935 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1936 PetscCall(PetscFree(rowhit)); 1937 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1938 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1939 1940 c->colorforrow = colorforrow; 1941 c->rows = rows; 1942 c->den2sp = den2sp; 1943 c->colorforcol = colorforcol; 1944 c->columns = columns; 1945 1946 PetscCall(PetscFree(idxhit)); 1947 PetscFunctionReturn(PETSC_SUCCESS); 1948 } 1949 1950 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1951 { 1952 Mat_Product *product = C->product; 1953 Mat A = product->A, B = product->B; 1954 1955 PetscFunctionBegin; 1956 if (C->ops->mattransposemultnumeric) { 1957 /* Alg: "outerproduct" */ 1958 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1959 } else { 1960 /* Alg: "matmatmult" -- C = At*B */ 1961 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1962 1963 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1964 if (atb->At) { 1965 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1966 user may have called MatProductReplaceMats() to get this A=product->A */ 1967 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1968 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1969 } 1970 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1971 } 1972 PetscFunctionReturn(PETSC_SUCCESS); 1973 } 1974 1975 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1976 { 1977 Mat_Product *product = C->product; 1978 Mat A = product->A, B = product->B; 1979 PetscReal fill = product->fill; 1980 1981 PetscFunctionBegin; 1982 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1983 1984 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1985 PetscFunctionReturn(PETSC_SUCCESS); 1986 } 1987 1988 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1989 { 1990 Mat_Product *product = C->product; 1991 PetscInt alg = 0; /* default algorithm */ 1992 PetscBool flg = PETSC_FALSE; 1993 #if !defined(PETSC_HAVE_HYPRE) 1994 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1995 PetscInt nalg = 7; 1996 #else 1997 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1998 PetscInt nalg = 8; 1999 #endif 2000 2001 PetscFunctionBegin; 2002 /* Set default algorithm */ 2003 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2004 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2005 2006 /* Get runtime option */ 2007 if (product->api_user) { 2008 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2009 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2010 PetscOptionsEnd(); 2011 } else { 2012 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2013 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2014 PetscOptionsEnd(); 2015 } 2016 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2017 2018 C->ops->productsymbolic = MatProductSymbolic_AB; 2019 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2020 PetscFunctionReturn(PETSC_SUCCESS); 2021 } 2022 2023 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2024 { 2025 Mat_Product *product = C->product; 2026 PetscInt alg = 0; /* default algorithm */ 2027 PetscBool flg = PETSC_FALSE; 2028 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2029 PetscInt nalg = 3; 2030 2031 PetscFunctionBegin; 2032 /* Get runtime option */ 2033 if (product->api_user) { 2034 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2035 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2036 PetscOptionsEnd(); 2037 } else { 2038 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2039 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2040 PetscOptionsEnd(); 2041 } 2042 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2043 2044 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2045 PetscFunctionReturn(PETSC_SUCCESS); 2046 } 2047 2048 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2049 { 2050 Mat_Product *product = C->product; 2051 PetscInt alg = 0; /* default algorithm */ 2052 PetscBool flg = PETSC_FALSE; 2053 const char *algTypes[2] = {"default", "color"}; 2054 PetscInt nalg = 2; 2055 2056 PetscFunctionBegin; 2057 /* Set default algorithm */ 2058 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2059 if (!flg) { 2060 alg = 1; 2061 PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2062 } 2063 2064 /* Get runtime option */ 2065 if (product->api_user) { 2066 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2067 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2068 PetscOptionsEnd(); 2069 } else { 2070 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2071 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2072 PetscOptionsEnd(); 2073 } 2074 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2075 2076 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2077 C->ops->productsymbolic = MatProductSymbolic_ABt; 2078 PetscFunctionReturn(PETSC_SUCCESS); 2079 } 2080 2081 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2082 { 2083 Mat_Product *product = C->product; 2084 PetscBool flg = PETSC_FALSE; 2085 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2086 #if !defined(PETSC_HAVE_HYPRE) 2087 const char *algTypes[2] = {"scalable", "rap"}; 2088 PetscInt nalg = 2; 2089 #else 2090 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2091 PetscInt nalg = 3; 2092 #endif 2093 2094 PetscFunctionBegin; 2095 /* Set default algorithm */ 2096 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2097 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2098 2099 /* Get runtime option */ 2100 if (product->api_user) { 2101 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2102 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2103 PetscOptionsEnd(); 2104 } else { 2105 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2106 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2107 PetscOptionsEnd(); 2108 } 2109 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2110 2111 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2112 PetscFunctionReturn(PETSC_SUCCESS); 2113 } 2114 2115 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2116 { 2117 Mat_Product *product = C->product; 2118 PetscBool flg = PETSC_FALSE; 2119 PetscInt alg = 0; /* default algorithm */ 2120 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2121 PetscInt nalg = 3; 2122 2123 PetscFunctionBegin; 2124 /* Set default algorithm */ 2125 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2126 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2127 2128 /* Get runtime option */ 2129 if (product->api_user) { 2130 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2131 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2132 PetscOptionsEnd(); 2133 } else { 2134 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2135 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2136 PetscOptionsEnd(); 2137 } 2138 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2139 2140 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2141 PetscFunctionReturn(PETSC_SUCCESS); 2142 } 2143 2144 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2145 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2146 { 2147 Mat_Product *product = C->product; 2148 PetscInt alg = 0; /* default algorithm */ 2149 PetscBool flg = PETSC_FALSE; 2150 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2151 PetscInt nalg = 7; 2152 2153 PetscFunctionBegin; 2154 /* Set default algorithm */ 2155 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2156 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2157 2158 /* Get runtime option */ 2159 if (product->api_user) { 2160 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2161 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2162 PetscOptionsEnd(); 2163 } else { 2164 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2165 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2166 PetscOptionsEnd(); 2167 } 2168 if (flg) PetscCall(MatProductSetAlgorithm(C, (MatProductAlgorithm)algTypes[alg])); 2169 2170 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2171 C->ops->productsymbolic = MatProductSymbolic_ABC; 2172 PetscFunctionReturn(PETSC_SUCCESS); 2173 } 2174 2175 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2176 { 2177 Mat_Product *product = C->product; 2178 2179 PetscFunctionBegin; 2180 switch (product->type) { 2181 case MATPRODUCT_AB: 2182 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2183 break; 2184 case MATPRODUCT_AtB: 2185 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2186 break; 2187 case MATPRODUCT_ABt: 2188 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2189 break; 2190 case MATPRODUCT_PtAP: 2191 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2192 break; 2193 case MATPRODUCT_RARt: 2194 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2195 break; 2196 case MATPRODUCT_ABC: 2197 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2198 break; 2199 default: 2200 break; 2201 } 2202 PetscFunctionReturn(PETSC_SUCCESS); 2203 } 2204