1 /* 2 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 3 C = A * B 4 */ 5 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <petscbt.h> 9 #include <petsc/private/isimpl.h> 10 #include <../src/mat/impls/dense/seq/dense.h> 11 12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 13 { 14 PetscFunctionBegin; 15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 17 PetscFunctionReturn(PETSC_SUCCESS); 18 } 19 20 /* Modified from MatCreateSeqAIJWithArrays() */ 21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 22 { 23 PetscInt ii; 24 Mat_SeqAIJ *aij; 25 PetscBool isseqaij, ofree_a, ofree_ij; 26 27 PetscFunctionBegin; 28 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 29 PetscCall(MatSetSizes(mat, m, n, m, n)); 30 31 if (!mtype) { 32 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 33 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 34 } else { 35 PetscCall(MatSetType(mat, mtype)); 36 } 37 38 aij = (Mat_SeqAIJ *)mat->data; 39 ofree_a = aij->free_a; 40 ofree_ij = aij->free_ij; 41 /* changes the free flags */ 42 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 43 44 PetscCall(PetscFree(aij->ilen)); 45 PetscCall(PetscFree(aij->imax)); 46 PetscCall(PetscMalloc1(m, &aij->imax)); 47 PetscCall(PetscMalloc1(m, &aij->ilen)); 48 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 49 const PetscInt rnz = i[ii + 1] - i[ii]; 50 aij->nonzerorowcnt += !!rnz; 51 aij->rmax = PetscMax(aij->rmax, rnz); 52 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 53 } 54 aij->maxnz = i[m]; 55 aij->nz = i[m]; 56 57 if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a)); 58 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j)); 59 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i)); 60 61 aij->i = i; 62 aij->j = j; 63 aij->a = a; 64 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 65 aij->free_a = PETSC_FALSE; 66 aij->free_ij = PETSC_FALSE; 67 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 68 // Always build the diag info when i, j are set 69 PetscCall(MatMarkDiagonal_SeqAIJ(mat)); 70 PetscFunctionReturn(PETSC_SUCCESS); 71 } 72 73 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 74 { 75 Mat_Product *product = C->product; 76 MatProductAlgorithm alg; 77 PetscBool flg; 78 79 PetscFunctionBegin; 80 if (product) { 81 alg = product->alg; 82 } else { 83 alg = "sorted"; 84 } 85 /* sorted */ 86 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 87 if (flg) { 88 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 /* scalable */ 93 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 94 if (flg) { 95 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 96 PetscFunctionReturn(PETSC_SUCCESS); 97 } 98 99 /* scalable_fast */ 100 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 101 if (flg) { 102 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 103 PetscFunctionReturn(PETSC_SUCCESS); 104 } 105 106 /* heap */ 107 PetscCall(PetscStrcmp(alg, "heap", &flg)); 108 if (flg) { 109 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 /* btheap */ 114 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 115 if (flg) { 116 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 117 PetscFunctionReturn(PETSC_SUCCESS); 118 } 119 120 /* llcondensed */ 121 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 122 if (flg) { 123 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 /* rowmerge */ 128 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 129 if (flg) { 130 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 #if defined(PETSC_HAVE_HYPRE) 135 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 136 if (flg) { 137 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 138 PetscFunctionReturn(PETSC_SUCCESS); 139 } 140 #endif 141 142 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 143 } 144 145 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 146 { 147 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 148 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 149 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 150 PetscReal afill; 151 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 152 PetscHMapI ta; 153 PetscBT lnkbt; 154 PetscFreeSpaceList free_space = NULL, current_space = NULL; 155 156 PetscFunctionBegin; 157 /* Get ci and cj */ 158 /* Allocate ci array, arrays for fill computation and */ 159 /* free space for accumulating nonzero column info */ 160 PetscCall(PetscMalloc1(am + 2, &ci)); 161 ci[0] = 0; 162 163 /* create and initialize a linked list */ 164 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 165 MatRowMergeMax_SeqAIJ(b, bm, ta); 166 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 167 PetscCall(PetscHMapIDestroy(&ta)); 168 169 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 170 171 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 172 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 173 174 current_space = free_space; 175 176 /* Determine ci and cj */ 177 for (i = 0; i < am; i++) { 178 anzi = ai[i + 1] - ai[i]; 179 aj = a->j + ai[i]; 180 for (j = 0; j < anzi; j++) { 181 brow = aj[j]; 182 bnzj = bi[brow + 1] - bi[brow]; 183 bj = b->j + bi[brow]; 184 /* add non-zero cols of B into the sorted linked list lnk */ 185 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 186 } 187 /* add possible missing diagonal entry */ 188 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 189 cnzi = lnk[0]; 190 191 /* If free space is not available, make more free space */ 192 /* Double the amount of total space in the list */ 193 if (current_space->local_remaining < cnzi) { 194 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 195 ndouble++; 196 } 197 198 /* Copy data into free space, then initialize lnk */ 199 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 200 201 current_space->array += cnzi; 202 current_space->local_used += cnzi; 203 current_space->local_remaining -= cnzi; 204 205 ci[i + 1] = ci[i] + cnzi; 206 } 207 208 /* Column indices are in the list of free space */ 209 /* Allocate space for cj, initialize cj, and */ 210 /* destroy list of free space and other temporary array(s) */ 211 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 212 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 213 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 214 215 /* put together the new symbolic matrix */ 216 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 217 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 218 219 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 220 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 221 c = (Mat_SeqAIJ *)C->data; 222 c->free_a = PETSC_FALSE; 223 c->free_ij = PETSC_TRUE; 224 c->nonew = 0; 225 226 /* fast, needs non-scalable O(bn) array 'abdense' */ 227 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 228 229 /* set MatInfo */ 230 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 231 if (afill < 1.0) afill = 1.0; 232 C->info.mallocs = ndouble; 233 C->info.fill_ratio_given = fill; 234 C->info.fill_ratio_needed = afill; 235 236 #if defined(PETSC_USE_INFO) 237 if (ci[am]) { 238 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 239 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 240 } else { 241 PetscCall(PetscInfo(C, "Empty matrix product\n")); 242 } 243 #endif 244 PetscFunctionReturn(PETSC_SUCCESS); 245 } 246 247 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 248 { 249 PetscLogDouble flops = 0.0; 250 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 251 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 252 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 253 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 254 PetscInt am = A->rmap->n, cm = C->rmap->n; 255 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 256 PetscScalar *ca, valtmp; 257 PetscScalar *ab_dense; 258 PetscContainer cab_dense; 259 const PetscScalar *aa, *ba, *baj; 260 261 PetscFunctionBegin; 262 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 263 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 264 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 265 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 266 c->a = ca; 267 c->free_a = PETSC_TRUE; 268 } else ca = c->a; 269 270 /* TODO this should be done in the symbolic phase */ 271 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 272 that is hard to eradicate) */ 273 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 274 if (!cab_dense) { 275 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 276 PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault)); 277 } else PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 278 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 279 280 /* clean old values in C */ 281 PetscCall(PetscArrayzero(ca, ci[cm])); 282 /* Traverse A row-wise. */ 283 /* Build the ith row in C by summing over nonzero columns in A, */ 284 /* the rows of B corresponding to nonzeros of A. */ 285 for (i = 0; i < am; i++) { 286 anzi = ai[i + 1] - ai[i]; 287 for (j = 0; j < anzi; j++) { 288 brow = aj[j]; 289 bnzi = bi[brow + 1] - bi[brow]; 290 bjj = PetscSafePointerPlusOffset(bj, bi[brow]); 291 baj = PetscSafePointerPlusOffset(ba, bi[brow]); 292 /* perform dense axpy */ 293 valtmp = aa[j]; 294 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 295 flops += 2 * bnzi; 296 } 297 aj = PetscSafePointerPlusOffset(aj, anzi); 298 aa = PetscSafePointerPlusOffset(aa, anzi); 299 300 cnzi = ci[i + 1] - ci[i]; 301 for (k = 0; k < cnzi; k++) { 302 ca[k] += ab_dense[cj[k]]; 303 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 304 } 305 flops += cnzi; 306 cj = PetscSafePointerPlusOffset(cj, cnzi); 307 ca += cnzi; 308 } 309 #if defined(PETSC_HAVE_DEVICE) 310 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 311 #endif 312 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 313 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 314 PetscCall(PetscLogFlops(flops)); 315 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 316 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 321 { 322 PetscLogDouble flops = 0.0; 323 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 324 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 325 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 326 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 327 PetscInt am = A->rmap->N, cm = C->rmap->N; 328 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 329 PetscScalar *ca = c->a, valtmp; 330 const PetscScalar *aa, *ba, *baj; 331 PetscInt nextb; 332 333 PetscFunctionBegin; 334 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 335 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 336 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 337 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 338 c->a = ca; 339 c->free_a = PETSC_TRUE; 340 } 341 342 /* clean old values in C */ 343 PetscCall(PetscArrayzero(ca, ci[cm])); 344 /* Traverse A row-wise. */ 345 /* Build the ith row in C by summing over nonzero columns in A, */ 346 /* the rows of B corresponding to nonzeros of A. */ 347 for (i = 0; i < am; i++) { 348 anzi = ai[i + 1] - ai[i]; 349 cnzi = ci[i + 1] - ci[i]; 350 for (j = 0; j < anzi; j++) { 351 brow = aj[j]; 352 bnzi = bi[brow + 1] - bi[brow]; 353 bjj = bj + bi[brow]; 354 baj = ba + bi[brow]; 355 /* perform sparse axpy */ 356 valtmp = aa[j]; 357 nextb = 0; 358 for (k = 0; nextb < bnzi; k++) { 359 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 360 ca[k] += valtmp * baj[nextb++]; 361 } 362 } 363 flops += 2 * bnzi; 364 } 365 aj += anzi; 366 aa += anzi; 367 cj += cnzi; 368 ca += cnzi; 369 } 370 #if defined(PETSC_HAVE_DEVICE) 371 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 372 #endif 373 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 374 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 375 PetscCall(PetscLogFlops(flops)); 376 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 377 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 378 PetscFunctionReturn(PETSC_SUCCESS); 379 } 380 381 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 382 { 383 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 384 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 385 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 386 MatScalar *ca; 387 PetscReal afill; 388 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 389 PetscHMapI ta; 390 PetscFreeSpaceList free_space = NULL, current_space = NULL; 391 392 PetscFunctionBegin; 393 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 394 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 395 PetscCall(PetscMalloc1(am + 2, &ci)); 396 ci[0] = 0; 397 398 /* create and initialize a linked list */ 399 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 400 MatRowMergeMax_SeqAIJ(b, bm, ta); 401 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 402 PetscCall(PetscHMapIDestroy(&ta)); 403 404 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 405 406 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 407 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 408 current_space = free_space; 409 410 /* Determine ci and cj */ 411 for (i = 0; i < am; i++) { 412 anzi = ai[i + 1] - ai[i]; 413 aj = a->j + ai[i]; 414 for (j = 0; j < anzi; j++) { 415 brow = aj[j]; 416 bnzj = bi[brow + 1] - bi[brow]; 417 bj = b->j + bi[brow]; 418 /* add non-zero cols of B into the sorted linked list lnk */ 419 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 420 } 421 /* add possible missing diagonal entry */ 422 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 423 cnzi = lnk[1]; 424 425 /* If free space is not available, make more free space */ 426 /* Double the amount of total space in the list */ 427 if (current_space->local_remaining < cnzi) { 428 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 429 ndouble++; 430 } 431 432 /* Copy data into free space, then initialize lnk */ 433 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 434 435 current_space->array += cnzi; 436 current_space->local_used += cnzi; 437 current_space->local_remaining -= cnzi; 438 439 ci[i + 1] = ci[i] + cnzi; 440 } 441 442 /* Column indices are in the list of free space */ 443 /* Allocate space for cj, initialize cj, and */ 444 /* destroy list of free space and other temporary array(s) */ 445 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 446 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 447 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 448 449 /* Allocate space for ca */ 450 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 451 452 /* put together the new symbolic matrix */ 453 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 454 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 455 456 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 457 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 458 c = (Mat_SeqAIJ *)C->data; 459 c->free_a = PETSC_TRUE; 460 c->free_ij = PETSC_TRUE; 461 c->nonew = 0; 462 463 /* slower, less memory */ 464 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 465 466 /* set MatInfo */ 467 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 468 if (afill < 1.0) afill = 1.0; 469 C->info.mallocs = ndouble; 470 C->info.fill_ratio_given = fill; 471 C->info.fill_ratio_needed = afill; 472 473 #if defined(PETSC_USE_INFO) 474 if (ci[am]) { 475 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 476 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 477 } else { 478 PetscCall(PetscInfo(C, "Empty matrix product\n")); 479 } 480 #endif 481 PetscFunctionReturn(PETSC_SUCCESS); 482 } 483 484 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 485 { 486 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 487 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 488 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 489 MatScalar *ca; 490 PetscReal afill; 491 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 492 PetscHMapI ta; 493 PetscFreeSpaceList free_space = NULL, current_space = NULL; 494 495 PetscFunctionBegin; 496 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 497 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 498 PetscCall(PetscMalloc1(am + 2, &ci)); 499 ci[0] = 0; 500 501 /* create and initialize a linked list */ 502 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 503 MatRowMergeMax_SeqAIJ(b, bm, ta); 504 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 505 PetscCall(PetscHMapIDestroy(&ta)); 506 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 507 508 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 509 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 510 current_space = free_space; 511 512 /* Determine ci and cj */ 513 for (i = 0; i < am; i++) { 514 anzi = ai[i + 1] - ai[i]; 515 aj = a->j + ai[i]; 516 for (j = 0; j < anzi; j++) { 517 brow = aj[j]; 518 bnzj = bi[brow + 1] - bi[brow]; 519 bj = b->j + bi[brow]; 520 /* add non-zero cols of B into the sorted linked list lnk */ 521 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 522 } 523 /* add possible missing diagonal entry */ 524 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 525 526 cnzi = lnk[0]; 527 528 /* If free space is not available, make more free space */ 529 /* Double the amount of total space in the list */ 530 if (current_space->local_remaining < cnzi) { 531 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 532 ndouble++; 533 } 534 535 /* Copy data into free space, then initialize lnk */ 536 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 537 538 current_space->array += cnzi; 539 current_space->local_used += cnzi; 540 current_space->local_remaining -= cnzi; 541 542 ci[i + 1] = ci[i] + cnzi; 543 } 544 545 /* Column indices are in the list of free space */ 546 /* Allocate space for cj, initialize cj, and */ 547 /* destroy list of free space and other temporary array(s) */ 548 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 549 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 550 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 551 552 /* Allocate space for ca */ 553 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 554 555 /* put together the new symbolic matrix */ 556 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 557 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 558 559 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 560 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 561 c = (Mat_SeqAIJ *)C->data; 562 c->free_a = PETSC_TRUE; 563 c->free_ij = PETSC_TRUE; 564 c->nonew = 0; 565 566 /* slower, less memory */ 567 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 568 569 /* set MatInfo */ 570 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 571 if (afill < 1.0) afill = 1.0; 572 C->info.mallocs = ndouble; 573 C->info.fill_ratio_given = fill; 574 C->info.fill_ratio_needed = afill; 575 576 #if defined(PETSC_USE_INFO) 577 if (ci[am]) { 578 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 579 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 580 } else { 581 PetscCall(PetscInfo(C, "Empty matrix product\n")); 582 } 583 #endif 584 PetscFunctionReturn(PETSC_SUCCESS); 585 } 586 587 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 588 { 589 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 590 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 591 PetscInt *ci, *cj, *bb; 592 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 593 PetscReal afill; 594 PetscInt i, j, col, ndouble = 0; 595 PetscFreeSpaceList free_space = NULL, current_space = NULL; 596 PetscHeap h; 597 598 PetscFunctionBegin; 599 /* Get ci and cj - by merging sorted rows using a heap */ 600 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 601 PetscCall(PetscMalloc1(am + 2, &ci)); 602 ci[0] = 0; 603 604 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 605 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 606 current_space = free_space; 607 608 PetscCall(PetscHeapCreate(a->rmax, &h)); 609 PetscCall(PetscMalloc1(a->rmax, &bb)); 610 611 /* Determine ci and cj */ 612 for (i = 0; i < am; i++) { 613 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 614 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 615 ci[i + 1] = ci[i]; 616 /* Populate the min heap */ 617 for (j = 0; j < anzi; j++) { 618 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 619 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 620 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 621 } 622 } 623 /* Pick off the min element, adding it to free space */ 624 PetscCall(PetscHeapPop(h, &j, &col)); 625 while (j >= 0) { 626 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 627 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 628 ndouble++; 629 } 630 *(current_space->array++) = col; 631 current_space->local_used++; 632 current_space->local_remaining--; 633 ci[i + 1]++; 634 635 /* stash if anything else remains in this row of B */ 636 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 637 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 638 PetscInt j2, col2; 639 PetscCall(PetscHeapPeek(h, &j2, &col2)); 640 if (col2 != col) break; 641 PetscCall(PetscHeapPop(h, &j2, &col2)); 642 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 643 } 644 /* Put any stashed elements back into the min heap */ 645 PetscCall(PetscHeapUnstash(h)); 646 PetscCall(PetscHeapPop(h, &j, &col)); 647 } 648 } 649 PetscCall(PetscFree(bb)); 650 PetscCall(PetscHeapDestroy(&h)); 651 652 /* Column indices are in the list of free space */ 653 /* Allocate space for cj, initialize cj, and */ 654 /* destroy list of free space and other temporary array(s) */ 655 PetscCall(PetscMalloc1(ci[am], &cj)); 656 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 657 658 /* put together the new symbolic matrix */ 659 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 660 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 661 662 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 663 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 664 c = (Mat_SeqAIJ *)C->data; 665 c->free_a = PETSC_TRUE; 666 c->free_ij = PETSC_TRUE; 667 c->nonew = 0; 668 669 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 670 671 /* set MatInfo */ 672 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 673 if (afill < 1.0) afill = 1.0; 674 C->info.mallocs = ndouble; 675 C->info.fill_ratio_given = fill; 676 C->info.fill_ratio_needed = afill; 677 678 #if defined(PETSC_USE_INFO) 679 if (ci[am]) { 680 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 681 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 682 } else { 683 PetscCall(PetscInfo(C, "Empty matrix product\n")); 684 } 685 #endif 686 PetscFunctionReturn(PETSC_SUCCESS); 687 } 688 689 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 690 { 691 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 692 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 693 PetscInt *ci, *cj, *bb; 694 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 695 PetscReal afill; 696 PetscInt i, j, col, ndouble = 0; 697 PetscFreeSpaceList free_space = NULL, current_space = NULL; 698 PetscHeap h; 699 PetscBT bt; 700 701 PetscFunctionBegin; 702 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 703 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 704 PetscCall(PetscMalloc1(am + 2, &ci)); 705 ci[0] = 0; 706 707 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 708 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 709 710 current_space = free_space; 711 712 PetscCall(PetscHeapCreate(a->rmax, &h)); 713 PetscCall(PetscMalloc1(a->rmax, &bb)); 714 PetscCall(PetscBTCreate(bn, &bt)); 715 716 /* Determine ci and cj */ 717 for (i = 0; i < am; i++) { 718 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 719 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 720 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 721 ci[i + 1] = ci[i]; 722 /* Populate the min heap */ 723 for (j = 0; j < anzi; j++) { 724 PetscInt brow = acol[j]; 725 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 726 PetscInt bcol = bj[bb[j]]; 727 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 728 PetscCall(PetscHeapAdd(h, j, bcol)); 729 bb[j]++; 730 break; 731 } 732 } 733 } 734 /* Pick off the min element, adding it to free space */ 735 PetscCall(PetscHeapPop(h, &j, &col)); 736 while (j >= 0) { 737 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 738 fptr = NULL; /* need PetscBTMemzero */ 739 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 740 ndouble++; 741 } 742 *(current_space->array++) = col; 743 current_space->local_used++; 744 current_space->local_remaining--; 745 ci[i + 1]++; 746 747 /* stash if anything else remains in this row of B */ 748 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 749 PetscInt bcol = bj[bb[j]]; 750 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 751 PetscCall(PetscHeapAdd(h, j, bcol)); 752 bb[j]++; 753 break; 754 } 755 } 756 PetscCall(PetscHeapPop(h, &j, &col)); 757 } 758 if (fptr) { /* Clear the bits for this row */ 759 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 760 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 761 PetscCall(PetscBTMemzero(bn, bt)); 762 } 763 } 764 PetscCall(PetscFree(bb)); 765 PetscCall(PetscHeapDestroy(&h)); 766 PetscCall(PetscBTDestroy(&bt)); 767 768 /* Column indices are in the list of free space */ 769 /* Allocate space for cj, initialize cj, and */ 770 /* destroy list of free space and other temporary array(s) */ 771 PetscCall(PetscMalloc1(ci[am], &cj)); 772 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 773 774 /* put together the new symbolic matrix */ 775 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 776 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 777 778 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 779 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 780 c = (Mat_SeqAIJ *)C->data; 781 c->free_a = PETSC_TRUE; 782 c->free_ij = PETSC_TRUE; 783 c->nonew = 0; 784 785 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 786 787 /* set MatInfo */ 788 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 789 if (afill < 1.0) afill = 1.0; 790 C->info.mallocs = ndouble; 791 C->info.fill_ratio_given = fill; 792 C->info.fill_ratio_needed = afill; 793 794 #if defined(PETSC_USE_INFO) 795 if (ci[am]) { 796 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 797 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 798 } else { 799 PetscCall(PetscInfo(C, "Empty matrix product\n")); 800 } 801 #endif 802 PetscFunctionReturn(PETSC_SUCCESS); 803 } 804 805 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 806 { 807 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 808 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 809 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 810 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 811 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 812 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 813 const PetscInt *brow_ptr[8], *brow_end[8]; 814 PetscInt window[8]; 815 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 816 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 817 PetscReal afill; 818 PetscInt *workj_L1, *workj_L2, *workj_L3; 819 PetscInt L1_nnz, L2_nnz; 820 821 /* Step 1: Get upper bound on memory required for allocation. 822 Because of the way virtual memory works, 823 only the memory pages that are actually needed will be physically allocated. */ 824 PetscFunctionBegin; 825 PetscCall(PetscMalloc1(am + 1, &ci)); 826 for (i = 0; i < am; i++) { 827 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 828 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 829 a_rownnz = 0; 830 for (k = 0; k < anzi; ++k) { 831 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 832 if (a_rownnz > bn) { 833 a_rownnz = bn; 834 break; 835 } 836 } 837 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 838 } 839 /* temporary work areas for merging rows */ 840 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 841 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 842 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 843 844 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 845 c_maxmem = 8 * (ai[am] + bi[bm]); 846 /* Step 2: Populate pattern for C */ 847 PetscCall(PetscMalloc1(c_maxmem, &cj)); 848 849 ci_nnz = 0; 850 ci[0] = 0; 851 worki_L1[0] = 0; 852 worki_L2[0] = 0; 853 for (i = 0; i < am; i++) { 854 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 855 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 856 rowsleft = anzi; 857 inputcol_L1 = acol; 858 L2_nnz = 0; 859 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 860 worki_L2[1] = 0; 861 outputi_nnz = 0; 862 863 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 864 while (ci_nnz + a_maxrownnz > c_maxmem) { 865 c_maxmem *= 2; 866 ndouble++; 867 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 868 } 869 870 while (rowsleft) { 871 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 872 L1_nrows = 0; 873 L1_nnz = 0; 874 inputcol = inputcol_L1; 875 inputi = bi; 876 inputj = bj; 877 878 /* The following macro is used to specialize for small rows in A. 879 This helps with compiler unrolling, improving performance substantially. 880 Input: inputj inputi inputcol bn 881 Output: outputj outputi_nnz */ 882 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 883 do { \ 884 window_min = bn; \ 885 outputi_nnz = 0; \ 886 for (k = 0; k < ANNZ; ++k) { \ 887 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 888 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 889 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 890 window_min = PetscMin(window[k], window_min); \ 891 } \ 892 while (window_min < bn) { \ 893 outputj[outputi_nnz++] = window_min; \ 894 /* advance front and compute new minimum */ \ 895 old_window_min = window_min; \ 896 window_min = bn; \ 897 for (k = 0; k < ANNZ; ++k) { \ 898 if (window[k] == old_window_min) { \ 899 brow_ptr[k]++; \ 900 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 901 } \ 902 window_min = PetscMin(window[k], window_min); \ 903 } \ 904 } \ 905 } while (0) 906 907 /************** L E V E L 1 ***************/ 908 /* Merge up to 8 rows of B to L1 work array*/ 909 while (L1_rowsleft) { 910 outputi_nnz = 0; 911 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 912 else outputj = cj + ci_nnz; /* Merge directly to C */ 913 914 switch (L1_rowsleft) { 915 case 1: 916 brow_ptr[0] = inputj + inputi[inputcol[0]]; 917 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 918 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 919 inputcol += L1_rowsleft; 920 rowsleft -= L1_rowsleft; 921 L1_rowsleft = 0; 922 break; 923 case 2: 924 MatMatMultSymbolic_RowMergeMacro(2); 925 inputcol += L1_rowsleft; 926 rowsleft -= L1_rowsleft; 927 L1_rowsleft = 0; 928 break; 929 case 3: 930 MatMatMultSymbolic_RowMergeMacro(3); 931 inputcol += L1_rowsleft; 932 rowsleft -= L1_rowsleft; 933 L1_rowsleft = 0; 934 break; 935 case 4: 936 MatMatMultSymbolic_RowMergeMacro(4); 937 inputcol += L1_rowsleft; 938 rowsleft -= L1_rowsleft; 939 L1_rowsleft = 0; 940 break; 941 case 5: 942 MatMatMultSymbolic_RowMergeMacro(5); 943 inputcol += L1_rowsleft; 944 rowsleft -= L1_rowsleft; 945 L1_rowsleft = 0; 946 break; 947 case 6: 948 MatMatMultSymbolic_RowMergeMacro(6); 949 inputcol += L1_rowsleft; 950 rowsleft -= L1_rowsleft; 951 L1_rowsleft = 0; 952 break; 953 case 7: 954 MatMatMultSymbolic_RowMergeMacro(7); 955 inputcol += L1_rowsleft; 956 rowsleft -= L1_rowsleft; 957 L1_rowsleft = 0; 958 break; 959 default: 960 MatMatMultSymbolic_RowMergeMacro(8); 961 inputcol += 8; 962 rowsleft -= 8; 963 L1_rowsleft -= 8; 964 break; 965 } 966 inputcol_L1 = inputcol; 967 L1_nnz += outputi_nnz; 968 worki_L1[++L1_nrows] = L1_nnz; 969 } 970 971 /********************** L E V E L 2 ************************/ 972 /* Merge from L1 work array to either C or to L2 work array */ 973 if (anzi > 8) { 974 inputi = worki_L1; 975 inputj = workj_L1; 976 inputcol = workcol; 977 outputi_nnz = 0; 978 979 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 980 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 981 982 switch (L1_nrows) { 983 case 1: 984 brow_ptr[0] = inputj + inputi[inputcol[0]]; 985 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 986 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 987 break; 988 case 2: 989 MatMatMultSymbolic_RowMergeMacro(2); 990 break; 991 case 3: 992 MatMatMultSymbolic_RowMergeMacro(3); 993 break; 994 case 4: 995 MatMatMultSymbolic_RowMergeMacro(4); 996 break; 997 case 5: 998 MatMatMultSymbolic_RowMergeMacro(5); 999 break; 1000 case 6: 1001 MatMatMultSymbolic_RowMergeMacro(6); 1002 break; 1003 case 7: 1004 MatMatMultSymbolic_RowMergeMacro(7); 1005 break; 1006 case 8: 1007 MatMatMultSymbolic_RowMergeMacro(8); 1008 break; 1009 default: 1010 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1011 } 1012 L2_nnz += outputi_nnz; 1013 worki_L2[++L2_nrows] = L2_nnz; 1014 1015 /************************ L E V E L 3 **********************/ 1016 /* Merge from L2 work array to either C or to L2 work array */ 1017 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1018 inputi = worki_L2; 1019 inputj = workj_L2; 1020 inputcol = workcol; 1021 outputi_nnz = 0; 1022 if (rowsleft) outputj = workj_L3; 1023 else outputj = cj + ci_nnz; 1024 switch (L2_nrows) { 1025 case 1: 1026 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1027 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1028 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1029 break; 1030 case 2: 1031 MatMatMultSymbolic_RowMergeMacro(2); 1032 break; 1033 case 3: 1034 MatMatMultSymbolic_RowMergeMacro(3); 1035 break; 1036 case 4: 1037 MatMatMultSymbolic_RowMergeMacro(4); 1038 break; 1039 case 5: 1040 MatMatMultSymbolic_RowMergeMacro(5); 1041 break; 1042 case 6: 1043 MatMatMultSymbolic_RowMergeMacro(6); 1044 break; 1045 case 7: 1046 MatMatMultSymbolic_RowMergeMacro(7); 1047 break; 1048 case 8: 1049 MatMatMultSymbolic_RowMergeMacro(8); 1050 break; 1051 default: 1052 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1053 } 1054 L2_nrows = 1; 1055 L2_nnz = outputi_nnz; 1056 worki_L2[1] = outputi_nnz; 1057 /* Copy to workj_L2 */ 1058 if (rowsleft) { 1059 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1060 } 1061 } 1062 } 1063 } /* while (rowsleft) */ 1064 #undef MatMatMultSymbolic_RowMergeMacro 1065 1066 /* terminate current row */ 1067 ci_nnz += outputi_nnz; 1068 ci[i + 1] = ci_nnz; 1069 } 1070 1071 /* Step 3: Create the new symbolic matrix */ 1072 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1073 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1074 1075 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1076 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1077 c = (Mat_SeqAIJ *)C->data; 1078 c->free_a = PETSC_TRUE; 1079 c->free_ij = PETSC_TRUE; 1080 c->nonew = 0; 1081 1082 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1083 1084 /* set MatInfo */ 1085 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1086 if (afill < 1.0) afill = 1.0; 1087 C->info.mallocs = ndouble; 1088 C->info.fill_ratio_given = fill; 1089 C->info.fill_ratio_needed = afill; 1090 1091 #if defined(PETSC_USE_INFO) 1092 if (ci[am]) { 1093 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1094 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1095 } else { 1096 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1097 } 1098 #endif 1099 1100 /* Step 4: Free temporary work areas */ 1101 PetscCall(PetscFree(workj_L1)); 1102 PetscCall(PetscFree(workj_L2)); 1103 PetscCall(PetscFree(workj_L3)); 1104 PetscFunctionReturn(PETSC_SUCCESS); 1105 } 1106 1107 /* concatenate unique entries and then sort */ 1108 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1109 { 1110 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1111 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1112 PetscInt *ci, *cj, bcol; 1113 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1114 PetscReal afill; 1115 PetscInt i, j, ndouble = 0; 1116 PetscSegBuffer seg, segrow; 1117 char *seen; 1118 1119 PetscFunctionBegin; 1120 PetscCall(PetscMalloc1(am + 1, &ci)); 1121 ci[0] = 0; 1122 1123 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1124 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1125 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1126 PetscCall(PetscCalloc1(bn, &seen)); 1127 1128 /* Determine ci and cj */ 1129 for (i = 0; i < am; i++) { 1130 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1131 const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */ 1132 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1133 1134 /* Pack segrow */ 1135 for (j = 0; j < anzi; j++) { 1136 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1137 for (k = bjstart; k < bjend; k++) { 1138 bcol = bj[k]; 1139 if (!seen[bcol]) { /* new entry */ 1140 PetscInt *PETSC_RESTRICT slot; 1141 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1142 *slot = bcol; 1143 seen[bcol] = 1; 1144 packlen++; 1145 } 1146 } 1147 } 1148 1149 /* Check i-th diagonal entry */ 1150 if (C->force_diagonals && !seen[i]) { 1151 PetscInt *PETSC_RESTRICT slot; 1152 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1153 *slot = i; 1154 seen[i] = 1; 1155 packlen++; 1156 } 1157 1158 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1159 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1160 PetscCall(PetscSortInt(packlen, crow)); 1161 ci[i + 1] = ci[i] + packlen; 1162 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1163 } 1164 PetscCall(PetscSegBufferDestroy(&segrow)); 1165 PetscCall(PetscFree(seen)); 1166 1167 /* Column indices are in the segmented buffer */ 1168 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1169 PetscCall(PetscSegBufferDestroy(&seg)); 1170 1171 /* put together the new symbolic matrix */ 1172 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1173 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1174 1175 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1176 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1177 c = (Mat_SeqAIJ *)C->data; 1178 c->free_a = PETSC_TRUE; 1179 c->free_ij = PETSC_TRUE; 1180 c->nonew = 0; 1181 1182 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1183 1184 /* set MatInfo */ 1185 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1186 if (afill < 1.0) afill = 1.0; 1187 C->info.mallocs = ndouble; 1188 C->info.fill_ratio_given = fill; 1189 C->info.fill_ratio_needed = afill; 1190 1191 #if defined(PETSC_USE_INFO) 1192 if (ci[am]) { 1193 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1194 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1195 } else { 1196 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1197 } 1198 #endif 1199 PetscFunctionReturn(PETSC_SUCCESS); 1200 } 1201 1202 static PetscErrorCode MatDestroy_SeqAIJ_MatMatMultTrans(void *data) 1203 { 1204 Mat_MatMatTransMult *abt = (Mat_MatMatTransMult *)data; 1205 1206 PetscFunctionBegin; 1207 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1208 PetscCall(MatDestroy(&abt->Bt_den)); 1209 PetscCall(MatDestroy(&abt->ABt_den)); 1210 PetscCall(PetscFree(abt)); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1215 { 1216 Mat Bt; 1217 Mat_MatMatTransMult *abt; 1218 Mat_Product *product = C->product; 1219 char *alg; 1220 1221 PetscFunctionBegin; 1222 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1223 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1224 1225 /* create symbolic Bt */ 1226 PetscCall(MatTransposeSymbolic(B, &Bt)); 1227 PetscCall(MatSetBlockSizes(Bt, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1228 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1229 1230 /* get symbolic C=A*Bt */ 1231 PetscCall(PetscStrallocpy(product->alg, &alg)); 1232 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1233 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1234 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1235 PetscCall(PetscFree(alg)); 1236 1237 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1238 PetscCall(PetscNew(&abt)); 1239 1240 product->data = abt; 1241 product->destroy = MatDestroy_SeqAIJ_MatMatMultTrans; 1242 1243 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1244 1245 abt->usecoloring = PETSC_FALSE; 1246 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1247 if (abt->usecoloring) { 1248 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1249 MatTransposeColoring matcoloring; 1250 MatColoring coloring; 1251 ISColoring iscoloring; 1252 Mat Bt_dense, C_dense; 1253 1254 /* inode causes memory problem */ 1255 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1256 1257 PetscCall(MatColoringCreate(C, &coloring)); 1258 PetscCall(MatColoringSetDistance(coloring, 2)); 1259 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1260 PetscCall(MatColoringSetFromOptions(coloring)); 1261 PetscCall(MatColoringApply(coloring, &iscoloring)); 1262 PetscCall(MatColoringDestroy(&coloring)); 1263 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1264 1265 abt->matcoloring = matcoloring; 1266 1267 PetscCall(ISColoringDestroy(&iscoloring)); 1268 1269 /* Create Bt_dense and C_dense = A*Bt_dense */ 1270 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1271 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1272 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1273 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1274 1275 Bt_dense->assembled = PETSC_TRUE; 1276 abt->Bt_den = Bt_dense; 1277 1278 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1279 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1280 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1281 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1282 1283 Bt_dense->assembled = PETSC_TRUE; 1284 abt->ABt_den = C_dense; 1285 1286 #if defined(PETSC_USE_INFO) 1287 { 1288 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1289 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1290 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1291 } 1292 #endif 1293 } 1294 /* clean up */ 1295 PetscCall(MatDestroy(&Bt)); 1296 PetscFunctionReturn(PETSC_SUCCESS); 1297 } 1298 1299 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1300 { 1301 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1302 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1303 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1304 PetscLogDouble flops = 0.0; 1305 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1306 Mat_MatMatTransMult *abt; 1307 Mat_Product *product = C->product; 1308 1309 PetscFunctionBegin; 1310 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1311 abt = (Mat_MatMatTransMult *)product->data; 1312 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1313 /* clear old values in C */ 1314 if (!c->a) { 1315 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1316 c->a = ca; 1317 c->free_a = PETSC_TRUE; 1318 } else { 1319 ca = c->a; 1320 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1321 } 1322 1323 if (abt->usecoloring) { 1324 MatTransposeColoring matcoloring = abt->matcoloring; 1325 Mat Bt_dense, C_dense = abt->ABt_den; 1326 1327 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1328 Bt_dense = abt->Bt_den; 1329 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1330 1331 /* C_dense = A*Bt_dense */ 1332 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1333 1334 /* Recover C from C_dense */ 1335 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 for (i = 0; i < cm; i++) { 1340 anzi = ai[i + 1] - ai[i]; 1341 acol = PetscSafePointerPlusOffset(aj, ai[i]); 1342 aval = PetscSafePointerPlusOffset(aa, ai[i]); 1343 cnzi = ci[i + 1] - ci[i]; 1344 ccol = PetscSafePointerPlusOffset(cj, ci[i]); 1345 cval = ca + ci[i]; 1346 for (j = 0; j < cnzi; j++) { 1347 brow = ccol[j]; 1348 bnzj = bi[brow + 1] - bi[brow]; 1349 bcol = bj + bi[brow]; 1350 bval = ba + bi[brow]; 1351 1352 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1353 nexta = 0; 1354 nextb = 0; 1355 while (nexta < anzi && nextb < bnzj) { 1356 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1357 if (nexta == anzi) break; 1358 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1359 if (nextb == bnzj) break; 1360 if (acol[nexta] == bcol[nextb]) { 1361 cval[j] += aval[nexta] * bval[nextb]; 1362 nexta++; 1363 nextb++; 1364 flops += 2; 1365 } 1366 } 1367 } 1368 } 1369 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1370 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1371 PetscCall(PetscLogFlops(flops)); 1372 PetscFunctionReturn(PETSC_SUCCESS); 1373 } 1374 1375 PetscErrorCode MatDestroy_SeqAIJ_MatTransMatMult(void *data) 1376 { 1377 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)data; 1378 1379 PetscFunctionBegin; 1380 PetscCall(MatDestroy(&atb->At)); 1381 if (atb->destroy) PetscCall((*atb->destroy)(atb->data)); 1382 PetscCall(PetscFree(atb)); 1383 PetscFunctionReturn(PETSC_SUCCESS); 1384 } 1385 1386 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1387 { 1388 Mat At = NULL; 1389 Mat_Product *product = C->product; 1390 PetscBool flg, def, square; 1391 1392 PetscFunctionBegin; 1393 MatCheckProduct(C, 4); 1394 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1395 /* outerproduct */ 1396 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1397 if (flg) { 1398 /* create symbolic At */ 1399 if (!square) { 1400 PetscCall(MatTransposeSymbolic(A, &At)); 1401 PetscCall(MatSetBlockSizes(At, PetscAbs(A->cmap->bs), PetscAbs(B->cmap->bs))); 1402 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1403 } 1404 /* get symbolic C=At*B */ 1405 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1406 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1407 1408 /* clean up */ 1409 if (!square) PetscCall(MatDestroy(&At)); 1410 1411 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1412 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1413 PetscFunctionReturn(PETSC_SUCCESS); 1414 } 1415 1416 /* matmatmult */ 1417 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1418 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1419 if (flg || def) { 1420 Mat_MatTransMatMult *atb; 1421 1422 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1423 PetscCall(PetscNew(&atb)); 1424 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1425 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1426 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1427 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1428 product->data = atb; 1429 product->destroy = MatDestroy_SeqAIJ_MatTransMatMult; 1430 atb->At = At; 1431 1432 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1433 PetscFunctionReturn(PETSC_SUCCESS); 1434 } 1435 1436 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1437 } 1438 1439 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1440 { 1441 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1442 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1443 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1444 PetscLogDouble flops = 0.0; 1445 MatScalar *aa = a->a, *ba, *ca, *caj; 1446 1447 PetscFunctionBegin; 1448 if (!c->a) { 1449 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1450 1451 c->a = ca; 1452 c->free_a = PETSC_TRUE; 1453 } else { 1454 ca = c->a; 1455 PetscCall(PetscArrayzero(ca, ci[cm])); 1456 } 1457 1458 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1459 for (i = 0; i < am; i++) { 1460 bj = b->j + bi[i]; 1461 ba = b->a + bi[i]; 1462 bnzi = bi[i + 1] - bi[i]; 1463 anzi = ai[i + 1] - ai[i]; 1464 for (j = 0; j < anzi; j++) { 1465 nextb = 0; 1466 crow = *aj++; 1467 cjj = cj + ci[crow]; 1468 caj = ca + ci[crow]; 1469 /* perform sparse axpy operation. Note cjj includes bj. */ 1470 for (k = 0; nextb < bnzi; k++) { 1471 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1472 caj[k] += (*aa) * (*(ba + nextb)); 1473 nextb++; 1474 } 1475 } 1476 flops += 2 * bnzi; 1477 aa++; 1478 } 1479 } 1480 1481 /* Assemble the final matrix and clean up */ 1482 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1483 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1484 PetscCall(PetscLogFlops(flops)); 1485 PetscFunctionReturn(PETSC_SUCCESS); 1486 } 1487 1488 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1489 { 1490 PetscFunctionBegin; 1491 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1492 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1493 PetscFunctionReturn(PETSC_SUCCESS); 1494 } 1495 1496 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1497 { 1498 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1499 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1500 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1501 const PetscInt *aj; 1502 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1503 PetscInt clda; 1504 PetscInt am4, bm4, col, i, j, n; 1505 1506 PetscFunctionBegin; 1507 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1508 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1509 if (add) { 1510 PetscCall(MatDenseGetArray(C, &c)); 1511 } else { 1512 PetscCall(MatDenseGetArrayWrite(C, &c)); 1513 } 1514 PetscCall(MatDenseGetArrayRead(B, &b)); 1515 PetscCall(MatDenseGetLDA(B, &bm)); 1516 PetscCall(MatDenseGetLDA(C, &clda)); 1517 am4 = 4 * clda; 1518 bm4 = 4 * bm; 1519 if (b) { 1520 b1 = b; 1521 b2 = b1 + bm; 1522 b3 = b2 + bm; 1523 b4 = b3 + bm; 1524 } else b1 = b2 = b3 = b4 = NULL; 1525 c1 = c; 1526 c2 = c1 + clda; 1527 c3 = c2 + clda; 1528 c4 = c3 + clda; 1529 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1530 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1531 r1 = r2 = r3 = r4 = 0.0; 1532 n = a->i[i + 1] - a->i[i]; 1533 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1534 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1535 for (j = 0; j < n; j++) { 1536 const PetscScalar aatmp = aa[j]; 1537 const PetscInt ajtmp = aj[j]; 1538 r1 += aatmp * b1[ajtmp]; 1539 r2 += aatmp * b2[ajtmp]; 1540 r3 += aatmp * b3[ajtmp]; 1541 r4 += aatmp * b4[ajtmp]; 1542 } 1543 if (add) { 1544 c1[i] += r1; 1545 c2[i] += r2; 1546 c3[i] += r3; 1547 c4[i] += r4; 1548 } else { 1549 c1[i] = r1; 1550 c2[i] = r2; 1551 c3[i] = r3; 1552 c4[i] = r4; 1553 } 1554 } 1555 if (b) { 1556 b1 += bm4; 1557 b2 += bm4; 1558 b3 += bm4; 1559 b4 += bm4; 1560 } 1561 c1 += am4; 1562 c2 += am4; 1563 c3 += am4; 1564 c4 += am4; 1565 } 1566 /* process remaining columns */ 1567 if (col != cn) { 1568 PetscInt rc = cn - col; 1569 1570 if (rc == 1) { 1571 for (i = 0; i < am; i++) { 1572 r1 = 0.0; 1573 n = a->i[i + 1] - a->i[i]; 1574 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1575 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1576 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1577 if (add) c1[i] += r1; 1578 else c1[i] = r1; 1579 } 1580 } else if (rc == 2) { 1581 for (i = 0; i < am; i++) { 1582 r1 = r2 = 0.0; 1583 n = a->i[i + 1] - a->i[i]; 1584 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1585 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1586 for (j = 0; j < n; j++) { 1587 const PetscScalar aatmp = aa[j]; 1588 const PetscInt ajtmp = aj[j]; 1589 r1 += aatmp * b1[ajtmp]; 1590 r2 += aatmp * b2[ajtmp]; 1591 } 1592 if (add) { 1593 c1[i] += r1; 1594 c2[i] += r2; 1595 } else { 1596 c1[i] = r1; 1597 c2[i] = r2; 1598 } 1599 } 1600 } else { 1601 for (i = 0; i < am; i++) { 1602 r1 = r2 = r3 = 0.0; 1603 n = a->i[i + 1] - a->i[i]; 1604 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1605 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1606 for (j = 0; j < n; j++) { 1607 const PetscScalar aatmp = aa[j]; 1608 const PetscInt ajtmp = aj[j]; 1609 r1 += aatmp * b1[ajtmp]; 1610 r2 += aatmp * b2[ajtmp]; 1611 r3 += aatmp * b3[ajtmp]; 1612 } 1613 if (add) { 1614 c1[i] += r1; 1615 c2[i] += r2; 1616 c3[i] += r3; 1617 } else { 1618 c1[i] = r1; 1619 c2[i] = r2; 1620 c3[i] = r3; 1621 } 1622 } 1623 } 1624 } 1625 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1626 if (add) { 1627 PetscCall(MatDenseRestoreArray(C, &c)); 1628 } else { 1629 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1630 } 1631 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1632 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1633 PetscFunctionReturn(PETSC_SUCCESS); 1634 } 1635 1636 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1637 { 1638 PetscFunctionBegin; 1639 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1640 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1641 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1642 1643 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1644 PetscFunctionReturn(PETSC_SUCCESS); 1645 } 1646 1647 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1648 { 1649 PetscFunctionBegin; 1650 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1651 C->ops->productsymbolic = MatProductSymbolic_AB; 1652 PetscFunctionReturn(PETSC_SUCCESS); 1653 } 1654 1655 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1656 1657 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1658 { 1659 PetscFunctionBegin; 1660 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1661 C->ops->productsymbolic = MatProductSymbolic_AtB; 1662 PetscFunctionReturn(PETSC_SUCCESS); 1663 } 1664 1665 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1666 { 1667 PetscFunctionBegin; 1668 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1669 C->ops->productsymbolic = MatProductSymbolic_ABt; 1670 PetscFunctionReturn(PETSC_SUCCESS); 1671 } 1672 1673 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1674 { 1675 Mat_Product *product = C->product; 1676 1677 PetscFunctionBegin; 1678 switch (product->type) { 1679 case MATPRODUCT_AB: 1680 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1681 break; 1682 case MATPRODUCT_AtB: 1683 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1684 break; 1685 case MATPRODUCT_ABt: 1686 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1687 break; 1688 default: 1689 break; 1690 } 1691 PetscFunctionReturn(PETSC_SUCCESS); 1692 } 1693 1694 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1695 { 1696 Mat_Product *product = C->product; 1697 Mat A = product->A; 1698 PetscBool baij; 1699 1700 PetscFunctionBegin; 1701 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1702 if (!baij) { /* A is seqsbaij */ 1703 PetscBool sbaij; 1704 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1705 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1706 1707 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1708 } else { /* A is seqbaij */ 1709 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1710 } 1711 1712 C->ops->productsymbolic = MatProductSymbolic_AB; 1713 PetscFunctionReturn(PETSC_SUCCESS); 1714 } 1715 1716 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1717 { 1718 Mat_Product *product = C->product; 1719 1720 PetscFunctionBegin; 1721 MatCheckProduct(C, 1); 1722 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1723 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1724 PetscFunctionReturn(PETSC_SUCCESS); 1725 } 1726 1727 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1728 { 1729 PetscFunctionBegin; 1730 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1731 C->ops->productsymbolic = MatProductSymbolic_AB; 1732 PetscFunctionReturn(PETSC_SUCCESS); 1733 } 1734 1735 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1736 { 1737 Mat_Product *product = C->product; 1738 1739 PetscFunctionBegin; 1740 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1741 PetscFunctionReturn(PETSC_SUCCESS); 1742 } 1743 1744 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1745 { 1746 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1747 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1748 PetscInt *bi = b->i, *bj = b->j; 1749 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1750 MatScalar *btval, *btval_den, *ba = b->a; 1751 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1752 1753 PetscFunctionBegin; 1754 btval_den = btdense->v; 1755 PetscCall(PetscArrayzero(btval_den, m * n)); 1756 for (k = 0; k < ncolors; k++) { 1757 ncolumns = coloring->ncolumns[k]; 1758 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1759 col = *(columns + colorforcol[k] + l); 1760 btcol = bj + bi[col]; 1761 btval = ba + bi[col]; 1762 anz = bi[col + 1] - bi[col]; 1763 for (j = 0; j < anz; j++) { 1764 brow = btcol[j]; 1765 btval_den[brow] = btval[j]; 1766 } 1767 } 1768 btval_den += m; 1769 } 1770 PetscFunctionReturn(PETSC_SUCCESS); 1771 } 1772 1773 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1774 { 1775 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1776 const PetscScalar *ca_den, *ca_den_ptr; 1777 PetscScalar *ca = csp->a; 1778 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1779 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1780 PetscInt nrows, *row, *idx; 1781 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1782 1783 PetscFunctionBegin; 1784 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1785 1786 if (brows > 0) { 1787 PetscInt *lstart, row_end, row_start; 1788 lstart = matcoloring->lstart; 1789 PetscCall(PetscArrayzero(lstart, ncolors)); 1790 1791 row_end = brows; 1792 if (row_end > m) row_end = m; 1793 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1794 ca_den_ptr = ca_den; 1795 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1796 nrows = matcoloring->nrows[k]; 1797 row = rows + colorforrow[k]; 1798 idx = den2sp + colorforrow[k]; 1799 for (l = lstart[k]; l < nrows; l++) { 1800 if (row[l] >= row_end) { 1801 lstart[k] = l; 1802 break; 1803 } else { 1804 ca[idx[l]] = ca_den_ptr[row[l]]; 1805 } 1806 } 1807 ca_den_ptr += m; 1808 } 1809 row_end += brows; 1810 if (row_end > m) row_end = m; 1811 } 1812 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1813 ca_den_ptr = ca_den; 1814 for (k = 0; k < ncolors; k++) { 1815 nrows = matcoloring->nrows[k]; 1816 row = rows + colorforrow[k]; 1817 idx = den2sp + colorforrow[k]; 1818 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1819 ca_den_ptr += m; 1820 } 1821 } 1822 1823 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1824 #if defined(PETSC_USE_INFO) 1825 if (matcoloring->brows > 0) { 1826 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1827 } else { 1828 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1829 } 1830 #endif 1831 PetscFunctionReturn(PETSC_SUCCESS); 1832 } 1833 1834 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1835 { 1836 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1837 const PetscInt *is, *ci, *cj, *row_idx; 1838 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1839 IS *isa; 1840 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1841 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1842 PetscInt *colorforcol, *columns, *columns_i, brows; 1843 PetscBool flg; 1844 1845 PetscFunctionBegin; 1846 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1847 1848 /* bs >1 is not being tested yet! */ 1849 Nbs = mat->cmap->N / bs; 1850 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1851 c->N = Nbs; 1852 c->m = c->M; 1853 c->rstart = 0; 1854 c->brows = 100; 1855 1856 c->ncolors = nis; 1857 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1858 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1859 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1860 1861 brows = c->brows; 1862 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1863 if (flg) c->brows = brows; 1864 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1865 1866 colorforrow[0] = 0; 1867 rows_i = rows; 1868 den2sp_i = den2sp; 1869 1870 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1871 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1872 1873 colorforcol[0] = 0; 1874 columns_i = columns; 1875 1876 /* get column-wise storage of mat */ 1877 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1878 1879 cm = c->m; 1880 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1881 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1882 for (i = 0; i < nis; i++) { /* loop over color */ 1883 PetscCall(ISGetLocalSize(isa[i], &n)); 1884 PetscCall(ISGetIndices(isa[i], &is)); 1885 1886 c->ncolumns[i] = n; 1887 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1888 colorforcol[i + 1] = colorforcol[i] + n; 1889 columns_i += n; 1890 1891 /* fast, crude version requires O(N*N) work */ 1892 PetscCall(PetscArrayzero(rowhit, cm)); 1893 1894 for (j = 0; j < n; j++) { /* loop over columns*/ 1895 col = is[j]; 1896 row_idx = cj + ci[col]; 1897 m = ci[col + 1] - ci[col]; 1898 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1899 idxhit[*row_idx] = spidx[ci[col] + k]; 1900 rowhit[*row_idx++] = col + 1; 1901 } 1902 } 1903 /* count the number of hits */ 1904 nrows = 0; 1905 for (j = 0; j < cm; j++) { 1906 if (rowhit[j]) nrows++; 1907 } 1908 c->nrows[i] = nrows; 1909 colorforrow[i + 1] = colorforrow[i] + nrows; 1910 1911 nrows = 0; 1912 for (j = 0; j < cm; j++) { /* loop over rows */ 1913 if (rowhit[j]) { 1914 rows_i[nrows] = j; 1915 den2sp_i[nrows] = idxhit[j]; 1916 nrows++; 1917 } 1918 } 1919 den2sp_i += nrows; 1920 1921 PetscCall(ISRestoreIndices(isa[i], &is)); 1922 rows_i += nrows; 1923 } 1924 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1925 PetscCall(PetscFree(rowhit)); 1926 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1927 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1928 1929 c->colorforrow = colorforrow; 1930 c->rows = rows; 1931 c->den2sp = den2sp; 1932 c->colorforcol = colorforcol; 1933 c->columns = columns; 1934 1935 PetscCall(PetscFree(idxhit)); 1936 PetscFunctionReturn(PETSC_SUCCESS); 1937 } 1938 1939 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1940 { 1941 Mat_Product *product = C->product; 1942 Mat A = product->A, B = product->B; 1943 1944 PetscFunctionBegin; 1945 if (C->ops->mattransposemultnumeric) { 1946 /* Alg: "outerproduct" */ 1947 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1948 } else { 1949 /* Alg: "matmatmult" -- C = At*B */ 1950 Mat_MatTransMatMult *atb = (Mat_MatTransMatMult *)product->data; 1951 1952 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1953 if (atb->At) { 1954 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1955 user may have called MatProductReplaceMats() to get this A=product->A */ 1956 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1957 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1958 } 1959 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1960 } 1961 PetscFunctionReturn(PETSC_SUCCESS); 1962 } 1963 1964 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1965 { 1966 Mat_Product *product = C->product; 1967 Mat A = product->A, B = product->B; 1968 PetscReal fill = product->fill; 1969 1970 PetscFunctionBegin; 1971 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1972 1973 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1974 PetscFunctionReturn(PETSC_SUCCESS); 1975 } 1976 1977 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1978 { 1979 Mat_Product *product = C->product; 1980 PetscInt alg = 0; /* default algorithm */ 1981 PetscBool flg = PETSC_FALSE; 1982 #if !defined(PETSC_HAVE_HYPRE) 1983 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1984 PetscInt nalg = 7; 1985 #else 1986 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1987 PetscInt nalg = 8; 1988 #endif 1989 1990 PetscFunctionBegin; 1991 /* Set default algorithm */ 1992 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 1993 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 1994 1995 /* Get runtime option */ 1996 if (product->api_user) { 1997 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 1998 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 1999 PetscOptionsEnd(); 2000 } else { 2001 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2002 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2003 PetscOptionsEnd(); 2004 } 2005 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2006 2007 C->ops->productsymbolic = MatProductSymbolic_AB; 2008 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2009 PetscFunctionReturn(PETSC_SUCCESS); 2010 } 2011 2012 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2013 { 2014 Mat_Product *product = C->product; 2015 PetscInt alg = 0; /* default algorithm */ 2016 PetscBool flg = PETSC_FALSE; 2017 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2018 PetscInt nalg = 3; 2019 2020 PetscFunctionBegin; 2021 /* Get runtime option */ 2022 if (product->api_user) { 2023 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2024 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2025 PetscOptionsEnd(); 2026 } else { 2027 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2028 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2029 PetscOptionsEnd(); 2030 } 2031 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2032 2033 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2034 PetscFunctionReturn(PETSC_SUCCESS); 2035 } 2036 2037 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2038 { 2039 Mat_Product *product = C->product; 2040 PetscInt alg = 0; /* default algorithm */ 2041 PetscBool flg = PETSC_FALSE; 2042 const char *algTypes[2] = {"default", "color"}; 2043 PetscInt nalg = 2; 2044 2045 PetscFunctionBegin; 2046 /* Set default algorithm */ 2047 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2048 if (!flg) { 2049 alg = 1; 2050 PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2051 } 2052 2053 /* Get runtime option */ 2054 if (product->api_user) { 2055 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2056 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2057 PetscOptionsEnd(); 2058 } else { 2059 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2060 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2061 PetscOptionsEnd(); 2062 } 2063 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2064 2065 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2066 C->ops->productsymbolic = MatProductSymbolic_ABt; 2067 PetscFunctionReturn(PETSC_SUCCESS); 2068 } 2069 2070 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2071 { 2072 Mat_Product *product = C->product; 2073 PetscBool flg = PETSC_FALSE; 2074 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2075 #if !defined(PETSC_HAVE_HYPRE) 2076 const char *algTypes[2] = {"scalable", "rap"}; 2077 PetscInt nalg = 2; 2078 #else 2079 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2080 PetscInt nalg = 3; 2081 #endif 2082 2083 PetscFunctionBegin; 2084 /* Set default algorithm */ 2085 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2086 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2087 2088 /* Get runtime option */ 2089 if (product->api_user) { 2090 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2091 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2092 PetscOptionsEnd(); 2093 } else { 2094 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2095 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2096 PetscOptionsEnd(); 2097 } 2098 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2099 2100 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2101 PetscFunctionReturn(PETSC_SUCCESS); 2102 } 2103 2104 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2105 { 2106 Mat_Product *product = C->product; 2107 PetscBool flg = PETSC_FALSE; 2108 PetscInt alg = 0; /* default algorithm */ 2109 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2110 PetscInt nalg = 3; 2111 2112 PetscFunctionBegin; 2113 /* Set default algorithm */ 2114 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2115 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2116 2117 /* Get runtime option */ 2118 if (product->api_user) { 2119 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2120 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2121 PetscOptionsEnd(); 2122 } else { 2123 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2124 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2125 PetscOptionsEnd(); 2126 } 2127 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2128 2129 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2130 PetscFunctionReturn(PETSC_SUCCESS); 2131 } 2132 2133 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2134 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2135 { 2136 Mat_Product *product = C->product; 2137 PetscInt alg = 0; /* default algorithm */ 2138 PetscBool flg = PETSC_FALSE; 2139 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2140 PetscInt nalg = 7; 2141 2142 PetscFunctionBegin; 2143 /* Set default algorithm */ 2144 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2145 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2146 2147 /* Get runtime option */ 2148 if (product->api_user) { 2149 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2150 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2151 PetscOptionsEnd(); 2152 } else { 2153 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2154 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2155 PetscOptionsEnd(); 2156 } 2157 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2158 2159 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2160 C->ops->productsymbolic = MatProductSymbolic_ABC; 2161 PetscFunctionReturn(PETSC_SUCCESS); 2162 } 2163 2164 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2165 { 2166 Mat_Product *product = C->product; 2167 2168 PetscFunctionBegin; 2169 switch (product->type) { 2170 case MATPRODUCT_AB: 2171 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2172 break; 2173 case MATPRODUCT_AtB: 2174 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2175 break; 2176 case MATPRODUCT_ABt: 2177 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2178 break; 2179 case MATPRODUCT_PtAP: 2180 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2181 break; 2182 case MATPRODUCT_RARt: 2183 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2184 break; 2185 case MATPRODUCT_ABC: 2186 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2187 break; 2188 default: 2189 break; 2190 } 2191 PetscFunctionReturn(PETSC_SUCCESS); 2192 } 2193