1 /* 2 Defines matrix-matrix product routines for pairs of SeqAIJ matrices 3 C = A * B 4 */ 5 6 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 7 #include <../src/mat/utils/freespace.h> 8 #include <petscbt.h> 9 #include <petsc/private/isimpl.h> 10 #include <../src/mat/impls/dense/seq/dense.h> 11 12 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 13 { 14 PetscFunctionBegin; 15 if (C->ops->matmultnumeric) PetscCall((*C->ops->matmultnumeric)(A, B, C)); 16 else PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(A, B, C)); 17 PetscFunctionReturn(PETSC_SUCCESS); 18 } 19 20 /* Modified from MatCreateSeqAIJWithArrays() */ 21 PETSC_INTERN PetscErrorCode MatSetSeqAIJWithArrays_private(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt i[], PetscInt j[], PetscScalar a[], MatType mtype, Mat mat) 22 { 23 PetscInt ii; 24 Mat_SeqAIJ *aij; 25 PetscBool isseqaij, ofree_a, ofree_ij; 26 27 PetscFunctionBegin; 28 PetscCheck(m <= 0 || !i[0], PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "i (row indices) must start with 0"); 29 PetscCall(MatSetSizes(mat, m, n, m, n)); 30 31 if (!mtype) { 32 PetscCall(PetscObjectBaseTypeCompare((PetscObject)mat, MATSEQAIJ, &isseqaij)); 33 if (!isseqaij) PetscCall(MatSetType(mat, MATSEQAIJ)); 34 } else { 35 PetscCall(MatSetType(mat, mtype)); 36 } 37 38 aij = (Mat_SeqAIJ *)mat->data; 39 ofree_a = aij->free_a; 40 ofree_ij = aij->free_ij; 41 /* changes the free flags */ 42 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, MAT_SKIP_ALLOCATION, NULL)); 43 44 PetscCall(PetscFree(aij->ilen)); 45 PetscCall(PetscFree(aij->imax)); 46 PetscCall(PetscMalloc1(m, &aij->imax)); 47 PetscCall(PetscMalloc1(m, &aij->ilen)); 48 for (ii = 0, aij->nonzerorowcnt = 0, aij->rmax = 0; ii < m; ii++) { 49 const PetscInt rnz = i[ii + 1] - i[ii]; 50 aij->nonzerorowcnt += !!rnz; 51 aij->rmax = PetscMax(aij->rmax, rnz); 52 aij->ilen[ii] = aij->imax[ii] = i[ii + 1] - i[ii]; 53 } 54 aij->maxnz = i[m]; 55 aij->nz = i[m]; 56 57 if (ofree_a) PetscCall(PetscShmgetDeallocateArray((void **)&aij->a)); 58 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->j)); 59 if (ofree_ij) PetscCall(PetscShmgetDeallocateArray((void **)&aij->i)); 60 61 aij->i = i; 62 aij->j = j; 63 aij->a = a; 64 aij->nonew = -1; /* this indicates that inserting a new value in the matrix that generates a new nonzero is an error */ 65 aij->free_a = PETSC_FALSE; 66 aij->free_ij = PETSC_FALSE; 67 PetscCall(MatCheckCompressedRow(mat, aij->nonzerorowcnt, &aij->compressedrow, aij->i, m, 0.6)); 68 // Always build the diag info when i, j are set 69 PetscCall(MatMarkDiagonal_SeqAIJ(mat)); 70 PetscFunctionReturn(PETSC_SUCCESS); 71 } 72 73 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 74 { 75 Mat_Product *product = C->product; 76 MatProductAlgorithm alg; 77 PetscBool flg; 78 79 PetscFunctionBegin; 80 if (product) { 81 alg = product->alg; 82 } else { 83 alg = "sorted"; 84 } 85 /* sorted */ 86 PetscCall(PetscStrcmp(alg, "sorted", &flg)); 87 if (flg) { 88 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(A, B, fill, C)); 89 PetscFunctionReturn(PETSC_SUCCESS); 90 } 91 92 /* scalable */ 93 PetscCall(PetscStrcmp(alg, "scalable", &flg)); 94 if (flg) { 95 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(A, B, fill, C)); 96 PetscFunctionReturn(PETSC_SUCCESS); 97 } 98 99 /* scalable_fast */ 100 PetscCall(PetscStrcmp(alg, "scalable_fast", &flg)); 101 if (flg) { 102 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(A, B, fill, C)); 103 PetscFunctionReturn(PETSC_SUCCESS); 104 } 105 106 /* heap */ 107 PetscCall(PetscStrcmp(alg, "heap", &flg)); 108 if (flg) { 109 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(A, B, fill, C)); 110 PetscFunctionReturn(PETSC_SUCCESS); 111 } 112 113 /* btheap */ 114 PetscCall(PetscStrcmp(alg, "btheap", &flg)); 115 if (flg) { 116 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(A, B, fill, C)); 117 PetscFunctionReturn(PETSC_SUCCESS); 118 } 119 120 /* llcondensed */ 121 PetscCall(PetscStrcmp(alg, "llcondensed", &flg)); 122 if (flg) { 123 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(A, B, fill, C)); 124 PetscFunctionReturn(PETSC_SUCCESS); 125 } 126 127 /* rowmerge */ 128 PetscCall(PetscStrcmp(alg, "rowmerge", &flg)); 129 if (flg) { 130 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(A, B, fill, C)); 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 #if defined(PETSC_HAVE_HYPRE) 135 PetscCall(PetscStrcmp(alg, "hypre", &flg)); 136 if (flg) { 137 PetscCall(MatMatMultSymbolic_AIJ_AIJ_wHYPRE(A, B, fill, C)); 138 PetscFunctionReturn(PETSC_SUCCESS); 139 } 140 #endif 141 142 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 143 } 144 145 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_LLCondensed(Mat A, Mat B, PetscReal fill, Mat C) 146 { 147 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 148 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 149 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 150 PetscReal afill; 151 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 152 PetscHMapI ta; 153 PetscBT lnkbt; 154 PetscFreeSpaceList free_space = NULL, current_space = NULL; 155 156 PetscFunctionBegin; 157 /* Get ci and cj */ 158 /* Allocate ci array, arrays for fill computation and */ 159 /* free space for accumulating nonzero column info */ 160 PetscCall(PetscMalloc1(am + 2, &ci)); 161 ci[0] = 0; 162 163 /* create and initialize a linked list */ 164 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 165 MatRowMergeMax_SeqAIJ(b, bm, ta); 166 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 167 PetscCall(PetscHMapIDestroy(&ta)); 168 169 PetscCall(PetscLLCondensedCreate(Crmax, bn, &lnk, &lnkbt)); 170 171 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 172 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 173 174 current_space = free_space; 175 176 /* Determine ci and cj */ 177 for (i = 0; i < am; i++) { 178 anzi = ai[i + 1] - ai[i]; 179 aj = a->j + ai[i]; 180 for (j = 0; j < anzi; j++) { 181 brow = aj[j]; 182 bnzj = bi[brow + 1] - bi[brow]; 183 bj = b->j + bi[brow]; 184 /* add non-zero cols of B into the sorted linked list lnk */ 185 PetscCall(PetscLLCondensedAddSorted(bnzj, bj, lnk, lnkbt)); 186 } 187 /* add possible missing diagonal entry */ 188 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted(1, &i, lnk, lnkbt)); 189 cnzi = lnk[0]; 190 191 /* If free space is not available, make more free space */ 192 /* Double the amount of total space in the list */ 193 if (current_space->local_remaining < cnzi) { 194 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 195 ndouble++; 196 } 197 198 /* Copy data into free space, then initialize lnk */ 199 PetscCall(PetscLLCondensedClean(bn, cnzi, current_space->array, lnk, lnkbt)); 200 201 current_space->array += cnzi; 202 current_space->local_used += cnzi; 203 current_space->local_remaining -= cnzi; 204 205 ci[i + 1] = ci[i] + cnzi; 206 } 207 208 /* Column indices are in the list of free space */ 209 /* Allocate space for cj, initialize cj, and */ 210 /* destroy list of free space and other temporary array(s) */ 211 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 212 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 213 PetscCall(PetscLLCondensedDestroy(lnk, lnkbt)); 214 215 /* put together the new symbolic matrix */ 216 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 217 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 218 219 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 220 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 221 c = (Mat_SeqAIJ *)C->data; 222 c->free_a = PETSC_FALSE; 223 c->free_ij = PETSC_TRUE; 224 c->nonew = 0; 225 226 /* fast, needs non-scalable O(bn) array 'abdense' */ 227 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 228 229 /* set MatInfo */ 230 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 231 if (afill < 1.0) afill = 1.0; 232 C->info.mallocs = ndouble; 233 C->info.fill_ratio_given = fill; 234 C->info.fill_ratio_needed = afill; 235 236 #if defined(PETSC_USE_INFO) 237 if (ci[am]) { 238 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 239 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 240 } else { 241 PetscCall(PetscInfo(C, "Empty matrix product\n")); 242 } 243 #endif 244 PetscFunctionReturn(PETSC_SUCCESS); 245 } 246 247 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, Mat C) 248 { 249 PetscLogDouble flops = 0.0; 250 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 251 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 252 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 253 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 254 PetscInt am = A->rmap->n, cm = C->rmap->n; 255 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 256 PetscScalar *ca, valtmp; 257 PetscScalar *ab_dense; 258 PetscContainer cab_dense; 259 const PetscScalar *aa, *ba, *baj; 260 261 PetscFunctionBegin; 262 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 263 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 264 if (!c->a) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 265 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 266 c->a = ca; 267 c->free_a = PETSC_TRUE; 268 } else ca = c->a; 269 270 /* TODO this should be done in the symbolic phase */ 271 /* However, this function is so heavily used (sometimes in an hidden way through multnumeric function pointers 272 that is hard to eradicate) */ 273 PetscCall(PetscObjectQuery((PetscObject)C, "__PETSc__ab_dense", (PetscObject *)&cab_dense)); 274 if (!cab_dense) { 275 PetscCall(PetscMalloc1(B->cmap->N, &ab_dense)); 276 PetscCall(PetscObjectContainerCompose((PetscObject)C, "__PETSc__ab_dense", ab_dense, PetscCtxDestroyDefault)); 277 } else PetscCall(PetscContainerGetPointer(cab_dense, (void **)&ab_dense)); 278 PetscCall(PetscArrayzero(ab_dense, B->cmap->N)); 279 280 /* clean old values in C */ 281 PetscCall(PetscArrayzero(ca, ci[cm])); 282 /* Traverse A row-wise. */ 283 /* Build the ith row in C by summing over nonzero columns in A, */ 284 /* the rows of B corresponding to nonzeros of A. */ 285 for (i = 0; i < am; i++) { 286 anzi = ai[i + 1] - ai[i]; 287 for (j = 0; j < anzi; j++) { 288 brow = aj[j]; 289 bnzi = bi[brow + 1] - bi[brow]; 290 bjj = PetscSafePointerPlusOffset(bj, bi[brow]); 291 baj = PetscSafePointerPlusOffset(ba, bi[brow]); 292 /* perform dense axpy */ 293 valtmp = aa[j]; 294 for (k = 0; k < bnzi; k++) ab_dense[bjj[k]] += valtmp * baj[k]; 295 flops += 2 * bnzi; 296 } 297 aj = PetscSafePointerPlusOffset(aj, anzi); 298 aa = PetscSafePointerPlusOffset(aa, anzi); 299 300 cnzi = ci[i + 1] - ci[i]; 301 for (k = 0; k < cnzi; k++) { 302 ca[k] += ab_dense[cj[k]]; 303 ab_dense[cj[k]] = 0.0; /* zero ab_dense */ 304 } 305 flops += cnzi; 306 cj = PetscSafePointerPlusOffset(cj, cnzi); 307 ca += cnzi; 308 } 309 #if defined(PETSC_HAVE_DEVICE) 310 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 311 #endif 312 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 313 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 314 PetscCall(PetscLogFlops(flops)); 315 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 316 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 317 PetscFunctionReturn(PETSC_SUCCESS); 318 } 319 320 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, Mat C) 321 { 322 PetscLogDouble flops = 0.0; 323 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 324 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 325 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 326 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bjj, *ci = c->i, *cj = c->j; 327 PetscInt am = A->rmap->N, cm = C->rmap->N; 328 PetscInt i, j, k, anzi, bnzi, cnzi, brow; 329 PetscScalar *ca = c->a, valtmp; 330 const PetscScalar *aa, *ba, *baj; 331 PetscInt nextb; 332 333 PetscFunctionBegin; 334 PetscCall(MatSeqAIJGetArrayRead(A, &aa)); 335 PetscCall(MatSeqAIJGetArrayRead(B, &ba)); 336 if (!ca) { /* first call of MatMatMultNumeric_SeqAIJ_SeqAIJ, allocate ca and matmult_abdense */ 337 PetscCall(PetscMalloc1(ci[cm] + 1, &ca)); 338 c->a = ca; 339 c->free_a = PETSC_TRUE; 340 } 341 342 /* clean old values in C */ 343 PetscCall(PetscArrayzero(ca, ci[cm])); 344 /* Traverse A row-wise. */ 345 /* Build the ith row in C by summing over nonzero columns in A, */ 346 /* the rows of B corresponding to nonzeros of A. */ 347 for (i = 0; i < am; i++) { 348 anzi = ai[i + 1] - ai[i]; 349 cnzi = ci[i + 1] - ci[i]; 350 for (j = 0; j < anzi; j++) { 351 brow = aj[j]; 352 bnzi = bi[brow + 1] - bi[brow]; 353 bjj = bj + bi[brow]; 354 baj = ba + bi[brow]; 355 /* perform sparse axpy */ 356 valtmp = aa[j]; 357 nextb = 0; 358 for (k = 0; nextb < bnzi; k++) { 359 if (cj[k] == bjj[nextb]) { /* ccol == bcol */ 360 ca[k] += valtmp * baj[nextb++]; 361 } 362 } 363 flops += 2 * bnzi; 364 } 365 aj += anzi; 366 aa += anzi; 367 cj += cnzi; 368 ca += cnzi; 369 } 370 #if defined(PETSC_HAVE_DEVICE) 371 if (C->offloadmask != PETSC_OFFLOAD_UNALLOCATED) C->offloadmask = PETSC_OFFLOAD_CPU; 372 #endif 373 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 374 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 375 PetscCall(PetscLogFlops(flops)); 376 PetscCall(MatSeqAIJRestoreArrayRead(A, &aa)); 377 PetscCall(MatSeqAIJRestoreArrayRead(B, &ba)); 378 PetscFunctionReturn(PETSC_SUCCESS); 379 } 380 381 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable_fast(Mat A, Mat B, PetscReal fill, Mat C) 382 { 383 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 384 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 385 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 386 MatScalar *ca; 387 PetscReal afill; 388 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 389 PetscHMapI ta; 390 PetscFreeSpaceList free_space = NULL, current_space = NULL; 391 392 PetscFunctionBegin; 393 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_fast() */ 394 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 395 PetscCall(PetscMalloc1(am + 2, &ci)); 396 ci[0] = 0; 397 398 /* create and initialize a linked list */ 399 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 400 MatRowMergeMax_SeqAIJ(b, bm, ta); 401 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 402 PetscCall(PetscHMapIDestroy(&ta)); 403 404 PetscCall(PetscLLCondensedCreate_fast(Crmax, &lnk)); 405 406 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 407 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 408 current_space = free_space; 409 410 /* Determine ci and cj */ 411 for (i = 0; i < am; i++) { 412 anzi = ai[i + 1] - ai[i]; 413 aj = a->j + ai[i]; 414 for (j = 0; j < anzi; j++) { 415 brow = aj[j]; 416 bnzj = bi[brow + 1] - bi[brow]; 417 bj = b->j + bi[brow]; 418 /* add non-zero cols of B into the sorted linked list lnk */ 419 PetscCall(PetscLLCondensedAddSorted_fast(bnzj, bj, lnk)); 420 } 421 /* add possible missing diagonal entry */ 422 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_fast(1, &i, lnk)); 423 cnzi = lnk[1]; 424 425 /* If free space is not available, make more free space */ 426 /* Double the amount of total space in the list */ 427 if (current_space->local_remaining < cnzi) { 428 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 429 ndouble++; 430 } 431 432 /* Copy data into free space, then initialize lnk */ 433 PetscCall(PetscLLCondensedClean_fast(cnzi, current_space->array, lnk)); 434 435 current_space->array += cnzi; 436 current_space->local_used += cnzi; 437 current_space->local_remaining -= cnzi; 438 439 ci[i + 1] = ci[i] + cnzi; 440 } 441 442 /* Column indices are in the list of free space */ 443 /* Allocate space for cj, initialize cj, and */ 444 /* destroy list of free space and other temporary array(s) */ 445 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 446 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 447 PetscCall(PetscLLCondensedDestroy_fast(lnk)); 448 449 /* Allocate space for ca */ 450 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 451 452 /* put together the new symbolic matrix */ 453 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 454 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 455 456 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 457 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 458 c = (Mat_SeqAIJ *)C->data; 459 c->free_a = PETSC_TRUE; 460 c->free_ij = PETSC_TRUE; 461 c->nonew = 0; 462 463 /* slower, less memory */ 464 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 465 466 /* set MatInfo */ 467 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 468 if (afill < 1.0) afill = 1.0; 469 C->info.mallocs = ndouble; 470 C->info.fill_ratio_given = fill; 471 C->info.fill_ratio_needed = afill; 472 473 #if defined(PETSC_USE_INFO) 474 if (ci[am]) { 475 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 476 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 477 } else { 478 PetscCall(PetscInfo(C, "Empty matrix product\n")); 479 } 480 #endif 481 PetscFunctionReturn(PETSC_SUCCESS); 482 } 483 484 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Scalable(Mat A, Mat B, PetscReal fill, Mat C) 485 { 486 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 487 PetscInt *ai = a->i, *bi = b->i, *ci, *cj; 488 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 489 MatScalar *ca; 490 PetscReal afill; 491 PetscInt i, j, anzi, brow, bnzj, cnzi, *bj, *aj, *lnk, ndouble = 0, Crmax; 492 PetscHMapI ta; 493 PetscFreeSpaceList free_space = NULL, current_space = NULL; 494 495 PetscFunctionBegin; 496 /* Get ci and cj - same as MatMatMultSymbolic_SeqAIJ_SeqAIJ except using PetscLLxxx_Scalalbe() */ 497 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 498 PetscCall(PetscMalloc1(am + 2, &ci)); 499 ci[0] = 0; 500 501 /* create and initialize a linked list */ 502 PetscCall(PetscHMapICreateWithSize(bn, &ta)); 503 MatRowMergeMax_SeqAIJ(b, bm, ta); 504 PetscCall(PetscHMapIGetSize(ta, &Crmax)); 505 PetscCall(PetscHMapIDestroy(&ta)); 506 PetscCall(PetscLLCondensedCreate_Scalable(Crmax, &lnk)); 507 508 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 509 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 510 current_space = free_space; 511 512 /* Determine ci and cj */ 513 for (i = 0; i < am; i++) { 514 anzi = ai[i + 1] - ai[i]; 515 aj = a->j + ai[i]; 516 for (j = 0; j < anzi; j++) { 517 brow = aj[j]; 518 bnzj = bi[brow + 1] - bi[brow]; 519 bj = b->j + bi[brow]; 520 /* add non-zero cols of B into the sorted linked list lnk */ 521 PetscCall(PetscLLCondensedAddSorted_Scalable(bnzj, bj, lnk)); 522 } 523 /* add possible missing diagonal entry */ 524 if (C->force_diagonals) PetscCall(PetscLLCondensedAddSorted_Scalable(1, &i, lnk)); 525 526 cnzi = lnk[0]; 527 528 /* If free space is not available, make more free space */ 529 /* Double the amount of total space in the list */ 530 if (current_space->local_remaining < cnzi) { 531 PetscCall(PetscFreeSpaceGet(PetscIntSumTruncate(cnzi, current_space->total_array_size), ¤t_space)); 532 ndouble++; 533 } 534 535 /* Copy data into free space, then initialize lnk */ 536 PetscCall(PetscLLCondensedClean_Scalable(cnzi, current_space->array, lnk)); 537 538 current_space->array += cnzi; 539 current_space->local_used += cnzi; 540 current_space->local_remaining -= cnzi; 541 542 ci[i + 1] = ci[i] + cnzi; 543 } 544 545 /* Column indices are in the list of free space */ 546 /* Allocate space for cj, initialize cj, and */ 547 /* destroy list of free space and other temporary array(s) */ 548 PetscCall(PetscMalloc1(ci[am] + 1, &cj)); 549 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 550 PetscCall(PetscLLCondensedDestroy_Scalable(lnk)); 551 552 /* Allocate space for ca */ 553 PetscCall(PetscCalloc1(ci[am] + 1, &ca)); 554 555 /* put together the new symbolic matrix */ 556 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, ca, ((PetscObject)A)->type_name, C)); 557 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 558 559 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 560 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 561 c = (Mat_SeqAIJ *)C->data; 562 c->free_a = PETSC_TRUE; 563 c->free_ij = PETSC_TRUE; 564 c->nonew = 0; 565 566 /* slower, less memory */ 567 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable; 568 569 /* set MatInfo */ 570 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 571 if (afill < 1.0) afill = 1.0; 572 C->info.mallocs = ndouble; 573 C->info.fill_ratio_given = fill; 574 C->info.fill_ratio_needed = afill; 575 576 #if defined(PETSC_USE_INFO) 577 if (ci[am]) { 578 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 579 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 580 } else { 581 PetscCall(PetscInfo(C, "Empty matrix product\n")); 582 } 583 #endif 584 PetscFunctionReturn(PETSC_SUCCESS); 585 } 586 587 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Heap(Mat A, Mat B, PetscReal fill, Mat C) 588 { 589 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 590 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 591 PetscInt *ci, *cj, *bb; 592 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 593 PetscReal afill; 594 PetscInt i, j, col, ndouble = 0; 595 PetscFreeSpaceList free_space = NULL, current_space = NULL; 596 PetscHeap h; 597 598 PetscFunctionBegin; 599 /* Get ci and cj - by merging sorted rows using a heap */ 600 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 601 PetscCall(PetscMalloc1(am + 2, &ci)); 602 ci[0] = 0; 603 604 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 605 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 606 current_space = free_space; 607 608 PetscCall(PetscHeapCreate(a->rmax, &h)); 609 PetscCall(PetscMalloc1(a->rmax, &bb)); 610 611 /* Determine ci and cj */ 612 for (i = 0; i < am; i++) { 613 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 614 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 615 ci[i + 1] = ci[i]; 616 /* Populate the min heap */ 617 for (j = 0; j < anzi; j++) { 618 bb[j] = bi[acol[j]]; /* bb points at the start of the row */ 619 if (bb[j] < bi[acol[j] + 1]) { /* Add if row is nonempty */ 620 PetscCall(PetscHeapAdd(h, j, bj[bb[j]++])); 621 } 622 } 623 /* Pick off the min element, adding it to free space */ 624 PetscCall(PetscHeapPop(h, &j, &col)); 625 while (j >= 0) { 626 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 627 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 628 ndouble++; 629 } 630 *(current_space->array++) = col; 631 current_space->local_used++; 632 current_space->local_remaining--; 633 ci[i + 1]++; 634 635 /* stash if anything else remains in this row of B */ 636 if (bb[j] < bi[acol[j] + 1]) PetscCall(PetscHeapStash(h, j, bj[bb[j]++])); 637 while (1) { /* pop and stash any other rows of B that also had an entry in this column */ 638 PetscInt j2, col2; 639 PetscCall(PetscHeapPeek(h, &j2, &col2)); 640 if (col2 != col) break; 641 PetscCall(PetscHeapPop(h, &j2, &col2)); 642 if (bb[j2] < bi[acol[j2] + 1]) PetscCall(PetscHeapStash(h, j2, bj[bb[j2]++])); 643 } 644 /* Put any stashed elements back into the min heap */ 645 PetscCall(PetscHeapUnstash(h)); 646 PetscCall(PetscHeapPop(h, &j, &col)); 647 } 648 } 649 PetscCall(PetscFree(bb)); 650 PetscCall(PetscHeapDestroy(&h)); 651 652 /* Column indices are in the list of free space */ 653 /* Allocate space for cj, initialize cj, and */ 654 /* destroy list of free space and other temporary array(s) */ 655 PetscCall(PetscMalloc1(ci[am], &cj)); 656 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 657 658 /* put together the new symbolic matrix */ 659 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 660 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 661 662 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 663 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 664 c = (Mat_SeqAIJ *)C->data; 665 c->free_a = PETSC_TRUE; 666 c->free_ij = PETSC_TRUE; 667 c->nonew = 0; 668 669 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 670 671 /* set MatInfo */ 672 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 673 if (afill < 1.0) afill = 1.0; 674 C->info.mallocs = ndouble; 675 C->info.fill_ratio_given = fill; 676 C->info.fill_ratio_needed = afill; 677 678 #if defined(PETSC_USE_INFO) 679 if (ci[am]) { 680 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 681 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 682 } else { 683 PetscCall(PetscInfo(C, "Empty matrix product\n")); 684 } 685 #endif 686 PetscFunctionReturn(PETSC_SUCCESS); 687 } 688 689 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_BTHeap(Mat A, Mat B, PetscReal fill, Mat C) 690 { 691 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 692 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 693 PetscInt *ci, *cj, *bb; 694 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 695 PetscReal afill; 696 PetscInt i, j, col, ndouble = 0; 697 PetscFreeSpaceList free_space = NULL, current_space = NULL; 698 PetscHeap h; 699 PetscBT bt; 700 701 PetscFunctionBegin; 702 /* Get ci and cj - using a heap for the sorted rows, but use BT so that each index is only added once */ 703 /* Allocate arrays for fill computation and free space for accumulating nonzero column */ 704 PetscCall(PetscMalloc1(am + 2, &ci)); 705 ci[0] = 0; 706 707 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 708 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am], bi[bm])), &free_space)); 709 710 current_space = free_space; 711 712 PetscCall(PetscHeapCreate(a->rmax, &h)); 713 PetscCall(PetscMalloc1(a->rmax, &bb)); 714 PetscCall(PetscBTCreate(bn, &bt)); 715 716 /* Determine ci and cj */ 717 for (i = 0; i < am; i++) { 718 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 719 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 720 const PetscInt *fptr = current_space->array; /* Save beginning of the row so we can clear the BT later */ 721 ci[i + 1] = ci[i]; 722 /* Populate the min heap */ 723 for (j = 0; j < anzi; j++) { 724 PetscInt brow = acol[j]; 725 for (bb[j] = bi[brow]; bb[j] < bi[brow + 1]; bb[j]++) { 726 PetscInt bcol = bj[bb[j]]; 727 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 728 PetscCall(PetscHeapAdd(h, j, bcol)); 729 bb[j]++; 730 break; 731 } 732 } 733 } 734 /* Pick off the min element, adding it to free space */ 735 PetscCall(PetscHeapPop(h, &j, &col)); 736 while (j >= 0) { 737 if (current_space->local_remaining < 1) { /* double the size, but don't exceed 16 MiB */ 738 fptr = NULL; /* need PetscBTMemzero */ 739 PetscCall(PetscFreeSpaceGet(PetscMin(PetscIntMultTruncate(2, current_space->total_array_size), 16 << 20), ¤t_space)); 740 ndouble++; 741 } 742 *(current_space->array++) = col; 743 current_space->local_used++; 744 current_space->local_remaining--; 745 ci[i + 1]++; 746 747 /* stash if anything else remains in this row of B */ 748 for (; bb[j] < bi[acol[j] + 1]; bb[j]++) { 749 PetscInt bcol = bj[bb[j]]; 750 if (!PetscBTLookupSet(bt, bcol)) { /* new entry */ 751 PetscCall(PetscHeapAdd(h, j, bcol)); 752 bb[j]++; 753 break; 754 } 755 } 756 PetscCall(PetscHeapPop(h, &j, &col)); 757 } 758 if (fptr) { /* Clear the bits for this row */ 759 for (; fptr < current_space->array; fptr++) PetscCall(PetscBTClear(bt, *fptr)); 760 } else { /* We reallocated so we don't remember (easily) how to clear only the bits we changed */ 761 PetscCall(PetscBTMemzero(bn, bt)); 762 } 763 } 764 PetscCall(PetscFree(bb)); 765 PetscCall(PetscHeapDestroy(&h)); 766 PetscCall(PetscBTDestroy(&bt)); 767 768 /* Column indices are in the list of free space */ 769 /* Allocate space for cj, initialize cj, and */ 770 /* destroy list of free space and other temporary array(s) */ 771 PetscCall(PetscMalloc1(ci[am], &cj)); 772 PetscCall(PetscFreeSpaceContiguous(&free_space, cj)); 773 774 /* put together the new symbolic matrix */ 775 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 776 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 777 778 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 779 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 780 c = (Mat_SeqAIJ *)C->data; 781 c->free_a = PETSC_TRUE; 782 c->free_ij = PETSC_TRUE; 783 c->nonew = 0; 784 785 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 786 787 /* set MatInfo */ 788 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 789 if (afill < 1.0) afill = 1.0; 790 C->info.mallocs = ndouble; 791 C->info.fill_ratio_given = fill; 792 C->info.fill_ratio_needed = afill; 793 794 #if defined(PETSC_USE_INFO) 795 if (ci[am]) { 796 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 797 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 798 } else { 799 PetscCall(PetscInfo(C, "Empty matrix product\n")); 800 } 801 #endif 802 PetscFunctionReturn(PETSC_SUCCESS); 803 } 804 805 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_RowMerge(Mat A, Mat B, PetscReal fill, Mat C) 806 { 807 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 808 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j, *inputi, *inputj, *inputcol, *inputcol_L1; 809 PetscInt *ci, *cj, *outputj, worki_L1[9], worki_L2[9]; 810 PetscInt c_maxmem, a_maxrownnz = 0, a_rownnz; 811 const PetscInt workcol[8] = {0, 1, 2, 3, 4, 5, 6, 7}; 812 const PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 813 const PetscInt *brow_ptr[8], *brow_end[8]; 814 PetscInt window[8]; 815 PetscInt window_min, old_window_min, ci_nnz, outputi_nnz = 0, L1_nrows, L2_nrows; 816 PetscInt i, k, ndouble = 0, L1_rowsleft, rowsleft; 817 PetscReal afill; 818 PetscInt *workj_L1, *workj_L2, *workj_L3; 819 PetscInt L1_nnz, L2_nnz; 820 821 /* Step 1: Get upper bound on memory required for allocation. 822 Because of the way virtual memory works, 823 only the memory pages that are actually needed will be physically allocated. */ 824 PetscFunctionBegin; 825 PetscCall(PetscMalloc1(am + 1, &ci)); 826 for (i = 0; i < am; i++) { 827 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 828 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 829 a_rownnz = 0; 830 for (k = 0; k < anzi; ++k) { 831 a_rownnz += bi[acol[k] + 1] - bi[acol[k]]; 832 if (a_rownnz > bn) { 833 a_rownnz = bn; 834 break; 835 } 836 } 837 a_maxrownnz = PetscMax(a_maxrownnz, a_rownnz); 838 } 839 /* temporary work areas for merging rows */ 840 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L1)); 841 PetscCall(PetscMalloc1(a_maxrownnz * 8, &workj_L2)); 842 PetscCall(PetscMalloc1(a_maxrownnz, &workj_L3)); 843 844 /* This should be enough for almost all matrices. If not, memory is reallocated later. */ 845 c_maxmem = 8 * (ai[am] + bi[bm]); 846 /* Step 2: Populate pattern for C */ 847 PetscCall(PetscMalloc1(c_maxmem, &cj)); 848 849 ci_nnz = 0; 850 ci[0] = 0; 851 worki_L1[0] = 0; 852 worki_L2[0] = 0; 853 for (i = 0; i < am; i++) { 854 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 855 const PetscInt *acol = aj + ai[i]; /* column indices of nonzero entries in this row */ 856 rowsleft = anzi; 857 inputcol_L1 = acol; 858 L2_nnz = 0; 859 L2_nrows = 1; /* Number of rows to be merged on Level 3. output of L3 already exists -> initial value 1 */ 860 worki_L2[1] = 0; 861 outputi_nnz = 0; 862 863 /* If the number of indices in C so far + the max number of columns in the next row > c_maxmem -> allocate more memory */ 864 while (ci_nnz + a_maxrownnz > c_maxmem) { 865 c_maxmem *= 2; 866 ndouble++; 867 PetscCall(PetscRealloc(sizeof(PetscInt) * c_maxmem, &cj)); 868 } 869 870 while (rowsleft) { 871 L1_rowsleft = PetscMin(64, rowsleft); /* In the inner loop max 64 rows of B can be merged */ 872 L1_nrows = 0; 873 L1_nnz = 0; 874 inputcol = inputcol_L1; 875 inputi = bi; 876 inputj = bj; 877 878 /* The following macro is used to specialize for small rows in A. 879 This helps with compiler unrolling, improving performance substantially. 880 Input: inputj inputi inputcol bn 881 Output: outputj outputi_nnz */ 882 #define MatMatMultSymbolic_RowMergeMacro(ANNZ) \ 883 do { \ 884 window_min = bn; \ 885 outputi_nnz = 0; \ 886 for (k = 0; k < ANNZ; ++k) { \ 887 brow_ptr[k] = inputj + inputi[inputcol[k]]; \ 888 brow_end[k] = inputj + inputi[inputcol[k] + 1]; \ 889 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 890 window_min = PetscMin(window[k], window_min); \ 891 } \ 892 while (window_min < bn) { \ 893 outputj[outputi_nnz++] = window_min; \ 894 /* advance front and compute new minimum */ \ 895 old_window_min = window_min; \ 896 window_min = bn; \ 897 for (k = 0; k < ANNZ; ++k) { \ 898 if (window[k] == old_window_min) { \ 899 brow_ptr[k]++; \ 900 window[k] = (brow_ptr[k] != brow_end[k]) ? *brow_ptr[k] : bn; \ 901 } \ 902 window_min = PetscMin(window[k], window_min); \ 903 } \ 904 } \ 905 } while (0) 906 907 /************** L E V E L 1 ***************/ 908 /* Merge up to 8 rows of B to L1 work array*/ 909 while (L1_rowsleft) { 910 outputi_nnz = 0; 911 if (anzi > 8) outputj = workj_L1 + L1_nnz; /* Level 1 rowmerge*/ 912 else outputj = cj + ci_nnz; /* Merge directly to C */ 913 914 switch (L1_rowsleft) { 915 case 1: 916 brow_ptr[0] = inputj + inputi[inputcol[0]]; 917 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 918 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 919 inputcol += L1_rowsleft; 920 rowsleft -= L1_rowsleft; 921 L1_rowsleft = 0; 922 break; 923 case 2: 924 MatMatMultSymbolic_RowMergeMacro(2); 925 inputcol += L1_rowsleft; 926 rowsleft -= L1_rowsleft; 927 L1_rowsleft = 0; 928 break; 929 case 3: 930 MatMatMultSymbolic_RowMergeMacro(3); 931 inputcol += L1_rowsleft; 932 rowsleft -= L1_rowsleft; 933 L1_rowsleft = 0; 934 break; 935 case 4: 936 MatMatMultSymbolic_RowMergeMacro(4); 937 inputcol += L1_rowsleft; 938 rowsleft -= L1_rowsleft; 939 L1_rowsleft = 0; 940 break; 941 case 5: 942 MatMatMultSymbolic_RowMergeMacro(5); 943 inputcol += L1_rowsleft; 944 rowsleft -= L1_rowsleft; 945 L1_rowsleft = 0; 946 break; 947 case 6: 948 MatMatMultSymbolic_RowMergeMacro(6); 949 inputcol += L1_rowsleft; 950 rowsleft -= L1_rowsleft; 951 L1_rowsleft = 0; 952 break; 953 case 7: 954 MatMatMultSymbolic_RowMergeMacro(7); 955 inputcol += L1_rowsleft; 956 rowsleft -= L1_rowsleft; 957 L1_rowsleft = 0; 958 break; 959 default: 960 MatMatMultSymbolic_RowMergeMacro(8); 961 inputcol += 8; 962 rowsleft -= 8; 963 L1_rowsleft -= 8; 964 break; 965 } 966 inputcol_L1 = inputcol; 967 L1_nnz += outputi_nnz; 968 worki_L1[++L1_nrows] = L1_nnz; 969 } 970 971 /********************** L E V E L 2 ************************/ 972 /* Merge from L1 work array to either C or to L2 work array */ 973 if (anzi > 8) { 974 inputi = worki_L1; 975 inputj = workj_L1; 976 inputcol = workcol; 977 outputi_nnz = 0; 978 979 if (anzi <= 64) outputj = cj + ci_nnz; /* Merge from L1 work array to C */ 980 else outputj = workj_L2 + L2_nnz; /* Merge from L1 work array to L2 work array */ 981 982 switch (L1_nrows) { 983 case 1: 984 brow_ptr[0] = inputj + inputi[inputcol[0]]; 985 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 986 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 987 break; 988 case 2: 989 MatMatMultSymbolic_RowMergeMacro(2); 990 break; 991 case 3: 992 MatMatMultSymbolic_RowMergeMacro(3); 993 break; 994 case 4: 995 MatMatMultSymbolic_RowMergeMacro(4); 996 break; 997 case 5: 998 MatMatMultSymbolic_RowMergeMacro(5); 999 break; 1000 case 6: 1001 MatMatMultSymbolic_RowMergeMacro(6); 1002 break; 1003 case 7: 1004 MatMatMultSymbolic_RowMergeMacro(7); 1005 break; 1006 case 8: 1007 MatMatMultSymbolic_RowMergeMacro(8); 1008 break; 1009 default: 1010 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L1 work array!"); 1011 } 1012 L2_nnz += outputi_nnz; 1013 worki_L2[++L2_nrows] = L2_nnz; 1014 1015 /************************ L E V E L 3 **********************/ 1016 /* Merge from L2 work array to either C or to L2 work array */ 1017 if (anzi > 64 && (L2_nrows == 8 || rowsleft == 0)) { 1018 inputi = worki_L2; 1019 inputj = workj_L2; 1020 inputcol = workcol; 1021 outputi_nnz = 0; 1022 if (rowsleft) outputj = workj_L3; 1023 else outputj = cj + ci_nnz; 1024 switch (L2_nrows) { 1025 case 1: 1026 brow_ptr[0] = inputj + inputi[inputcol[0]]; 1027 brow_end[0] = inputj + inputi[inputcol[0] + 1]; 1028 for (; brow_ptr[0] != brow_end[0]; ++brow_ptr[0]) outputj[outputi_nnz++] = *brow_ptr[0]; /* copy row in b over */ 1029 break; 1030 case 2: 1031 MatMatMultSymbolic_RowMergeMacro(2); 1032 break; 1033 case 3: 1034 MatMatMultSymbolic_RowMergeMacro(3); 1035 break; 1036 case 4: 1037 MatMatMultSymbolic_RowMergeMacro(4); 1038 break; 1039 case 5: 1040 MatMatMultSymbolic_RowMergeMacro(5); 1041 break; 1042 case 6: 1043 MatMatMultSymbolic_RowMergeMacro(6); 1044 break; 1045 case 7: 1046 MatMatMultSymbolic_RowMergeMacro(7); 1047 break; 1048 case 8: 1049 MatMatMultSymbolic_RowMergeMacro(8); 1050 break; 1051 default: 1052 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatMatMult logic error: Not merging 1-8 rows from L2 work array!"); 1053 } 1054 L2_nrows = 1; 1055 L2_nnz = outputi_nnz; 1056 worki_L2[1] = outputi_nnz; 1057 /* Copy to workj_L2 */ 1058 if (rowsleft) { 1059 for (k = 0; k < outputi_nnz; ++k) workj_L2[k] = outputj[k]; 1060 } 1061 } 1062 } 1063 } /* while (rowsleft) */ 1064 #undef MatMatMultSymbolic_RowMergeMacro 1065 1066 /* terminate current row */ 1067 ci_nnz += outputi_nnz; 1068 ci[i + 1] = ci_nnz; 1069 } 1070 1071 /* Step 3: Create the new symbolic matrix */ 1072 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1073 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1074 1075 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1076 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1077 c = (Mat_SeqAIJ *)C->data; 1078 c->free_a = PETSC_TRUE; 1079 c->free_ij = PETSC_TRUE; 1080 c->nonew = 0; 1081 1082 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1083 1084 /* set MatInfo */ 1085 afill = (PetscReal)ci[am] / (ai[am] + bi[bm]) + 1.e-5; 1086 if (afill < 1.0) afill = 1.0; 1087 C->info.mallocs = ndouble; 1088 C->info.fill_ratio_given = fill; 1089 C->info.fill_ratio_needed = afill; 1090 1091 #if defined(PETSC_USE_INFO) 1092 if (ci[am]) { 1093 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1094 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1095 } else { 1096 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1097 } 1098 #endif 1099 1100 /* Step 4: Free temporary work areas */ 1101 PetscCall(PetscFree(workj_L1)); 1102 PetscCall(PetscFree(workj_L2)); 1103 PetscCall(PetscFree(workj_L3)); 1104 PetscFunctionReturn(PETSC_SUCCESS); 1105 } 1106 1107 /* concatenate unique entries and then sort */ 1108 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqAIJ_Sorted(Mat A, Mat B, PetscReal fill, Mat C) 1109 { 1110 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 1111 const PetscInt *ai = a->i, *bi = b->i, *aj = a->j, *bj = b->j; 1112 PetscInt *ci, *cj, bcol; 1113 PetscInt am = A->rmap->N, bn = B->cmap->N, bm = B->rmap->N; 1114 PetscReal afill; 1115 PetscInt i, j, ndouble = 0; 1116 PetscSegBuffer seg, segrow; 1117 char *seen; 1118 1119 PetscFunctionBegin; 1120 PetscCall(PetscMalloc1(am + 1, &ci)); 1121 ci[0] = 0; 1122 1123 /* Initial FreeSpace size is fill*(nnz(A)+nnz(B)) */ 1124 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), (PetscInt)(fill * (ai[am] + bi[bm])), &seg)); 1125 PetscCall(PetscSegBufferCreate(sizeof(PetscInt), 100, &segrow)); 1126 PetscCall(PetscCalloc1(bn, &seen)); 1127 1128 /* Determine ci and cj */ 1129 for (i = 0; i < am; i++) { 1130 const PetscInt anzi = ai[i + 1] - ai[i]; /* number of nonzeros in this row of A, this is the number of rows of B that we merge */ 1131 const PetscInt *acol = PetscSafePointerPlusOffset(aj, ai[i]); /* column indices of nonzero entries in this row */ 1132 PetscInt packlen = 0, *PETSC_RESTRICT crow; 1133 1134 /* Pack segrow */ 1135 for (j = 0; j < anzi; j++) { 1136 PetscInt brow = acol[j], bjstart = bi[brow], bjend = bi[brow + 1], k; 1137 for (k = bjstart; k < bjend; k++) { 1138 bcol = bj[k]; 1139 if (!seen[bcol]) { /* new entry */ 1140 PetscInt *PETSC_RESTRICT slot; 1141 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1142 *slot = bcol; 1143 seen[bcol] = 1; 1144 packlen++; 1145 } 1146 } 1147 } 1148 1149 /* Check i-th diagonal entry */ 1150 if (C->force_diagonals && !seen[i]) { 1151 PetscInt *PETSC_RESTRICT slot; 1152 PetscCall(PetscSegBufferGetInts(segrow, 1, &slot)); 1153 *slot = i; 1154 seen[i] = 1; 1155 packlen++; 1156 } 1157 1158 PetscCall(PetscSegBufferGetInts(seg, packlen, &crow)); 1159 PetscCall(PetscSegBufferExtractTo(segrow, crow)); 1160 PetscCall(PetscSortInt(packlen, crow)); 1161 ci[i + 1] = ci[i] + packlen; 1162 for (j = 0; j < packlen; j++) seen[crow[j]] = 0; 1163 } 1164 PetscCall(PetscSegBufferDestroy(&segrow)); 1165 PetscCall(PetscFree(seen)); 1166 1167 /* Column indices are in the segmented buffer */ 1168 PetscCall(PetscSegBufferExtractAlloc(seg, &cj)); 1169 PetscCall(PetscSegBufferDestroy(&seg)); 1170 1171 /* put together the new symbolic matrix */ 1172 PetscCall(MatSetSeqAIJWithArrays_private(PetscObjectComm((PetscObject)A), am, bn, ci, cj, NULL, ((PetscObject)A)->type_name, C)); 1173 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 1174 1175 /* MatCreateSeqAIJWithArrays flags matrix so PETSc doesn't free the user's arrays. */ 1176 /* These are PETSc arrays, so change flags so arrays can be deleted by PETSc */ 1177 c = (Mat_SeqAIJ *)C->data; 1178 c->free_a = PETSC_TRUE; 1179 c->free_ij = PETSC_TRUE; 1180 c->nonew = 0; 1181 1182 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqAIJ_Sorted; 1183 1184 /* set MatInfo */ 1185 afill = (PetscReal)ci[am] / PetscMax(ai[am] + bi[bm], 1) + 1.e-5; 1186 if (afill < 1.0) afill = 1.0; 1187 C->info.mallocs = ndouble; 1188 C->info.fill_ratio_given = fill; 1189 C->info.fill_ratio_needed = afill; 1190 1191 #if defined(PETSC_USE_INFO) 1192 if (ci[am]) { 1193 PetscCall(PetscInfo(C, "Reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n", ndouble, (double)fill, (double)afill)); 1194 PetscCall(PetscInfo(C, "Use MatMatMult(A,B,MatReuse,%g,&C) for best performance.;\n", (double)afill)); 1195 } else { 1196 PetscCall(PetscInfo(C, "Empty matrix product\n")); 1197 } 1198 #endif 1199 PetscFunctionReturn(PETSC_SUCCESS); 1200 } 1201 1202 static PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatMatMultTrans(void **data) 1203 { 1204 MatProductCtx_MatMatTransMult *abt = *(MatProductCtx_MatMatTransMult **)data; 1205 1206 PetscFunctionBegin; 1207 PetscCall(MatTransposeColoringDestroy(&abt->matcoloring)); 1208 PetscCall(MatDestroy(&abt->Bt_den)); 1209 PetscCall(MatDestroy(&abt->ABt_den)); 1210 PetscCall(PetscFree(abt)); 1211 PetscFunctionReturn(PETSC_SUCCESS); 1212 } 1213 1214 PetscErrorCode MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1215 { 1216 Mat Bt; 1217 MatProductCtx_MatMatTransMult *abt; 1218 Mat_Product *product = C->product; 1219 char *alg; 1220 1221 PetscFunctionBegin; 1222 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1223 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1224 1225 /* create symbolic Bt */ 1226 PetscCall(MatTransposeSymbolic(B, &Bt)); 1227 PetscCall(MatSetBlockSizes(Bt, A->cmap->bs, B->cmap->bs)); 1228 PetscCall(MatSetType(Bt, ((PetscObject)A)->type_name)); 1229 1230 /* get symbolic C=A*Bt */ 1231 PetscCall(PetscStrallocpy(product->alg, &alg)); 1232 PetscCall(MatProductSetAlgorithm(C, "sorted")); /* set algorithm for C = A*Bt */ 1233 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(A, Bt, fill, C)); 1234 PetscCall(MatProductSetAlgorithm(C, alg)); /* resume original algorithm for ABt product */ 1235 PetscCall(PetscFree(alg)); 1236 1237 /* create a supporting struct for reuse intermediate dense matrices with matcoloring */ 1238 PetscCall(PetscNew(&abt)); 1239 1240 product->data = abt; 1241 product->destroy = MatProductCtxDestroy_SeqAIJ_MatMatMultTrans; 1242 1243 C->ops->mattransposemultnumeric = MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ; 1244 1245 abt->usecoloring = PETSC_FALSE; 1246 PetscCall(PetscStrcmp(product->alg, "color", &abt->usecoloring)); 1247 if (abt->usecoloring) { 1248 /* Create MatTransposeColoring from symbolic C=A*B^T */ 1249 MatTransposeColoring matcoloring; 1250 MatColoring coloring; 1251 ISColoring iscoloring; 1252 Mat Bt_dense, C_dense; 1253 1254 /* inode causes memory problem */ 1255 PetscCall(MatSetOption(C, MAT_USE_INODES, PETSC_FALSE)); 1256 1257 PetscCall(MatColoringCreate(C, &coloring)); 1258 PetscCall(MatColoringSetDistance(coloring, 2)); 1259 PetscCall(MatColoringSetType(coloring, MATCOLORINGSL)); 1260 PetscCall(MatColoringSetFromOptions(coloring)); 1261 PetscCall(MatColoringApply(coloring, &iscoloring)); 1262 PetscCall(MatColoringDestroy(&coloring)); 1263 PetscCall(MatTransposeColoringCreate(C, iscoloring, &matcoloring)); 1264 1265 abt->matcoloring = matcoloring; 1266 1267 PetscCall(ISColoringDestroy(&iscoloring)); 1268 1269 /* Create Bt_dense and C_dense = A*Bt_dense */ 1270 PetscCall(MatCreate(PETSC_COMM_SELF, &Bt_dense)); 1271 PetscCall(MatSetSizes(Bt_dense, A->cmap->n, matcoloring->ncolors, A->cmap->n, matcoloring->ncolors)); 1272 PetscCall(MatSetType(Bt_dense, MATSEQDENSE)); 1273 PetscCall(MatSeqDenseSetPreallocation(Bt_dense, NULL)); 1274 1275 Bt_dense->assembled = PETSC_TRUE; 1276 abt->Bt_den = Bt_dense; 1277 1278 PetscCall(MatCreate(PETSC_COMM_SELF, &C_dense)); 1279 PetscCall(MatSetSizes(C_dense, A->rmap->n, matcoloring->ncolors, A->rmap->n, matcoloring->ncolors)); 1280 PetscCall(MatSetType(C_dense, MATSEQDENSE)); 1281 PetscCall(MatSeqDenseSetPreallocation(C_dense, NULL)); 1282 1283 Bt_dense->assembled = PETSC_TRUE; 1284 abt->ABt_den = C_dense; 1285 1286 #if defined(PETSC_USE_INFO) 1287 { 1288 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 1289 PetscCall(PetscInfo(C, "Use coloring of C=A*B^T; B^T: %" PetscInt_FMT " %" PetscInt_FMT ", Bt_dense: %" PetscInt_FMT ",%" PetscInt_FMT "; Cnz %" PetscInt_FMT " / (cm*ncolors %" PetscInt_FMT ") = %g\n", B->cmap->n, B->rmap->n, Bt_dense->rmap->n, 1290 Bt_dense->cmap->n, c->nz, A->rmap->n * matcoloring->ncolors, (double)(((PetscReal)c->nz) / ((PetscReal)(A->rmap->n * matcoloring->ncolors))))); 1291 } 1292 #endif 1293 } 1294 /* clean up */ 1295 PetscCall(MatDestroy(&Bt)); 1296 PetscFunctionReturn(PETSC_SUCCESS); 1297 } 1298 1299 PetscErrorCode MatMatTransposeMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1300 { 1301 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1302 PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, anzi, bnzj, nexta, nextb, *acol, *bcol, brow; 1303 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, i, j, cnzi, *ccol; 1304 PetscLogDouble flops = 0.0; 1305 MatScalar *aa = a->a, *aval, *ba = b->a, *bval, *ca, *cval; 1306 MatProductCtx_MatMatTransMult *abt; 1307 Mat_Product *product = C->product; 1308 1309 PetscFunctionBegin; 1310 PetscCheck(product, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1311 abt = (MatProductCtx_MatMatTransMult *)product->data; 1312 PetscCheck(abt, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1313 /* clear old values in C */ 1314 if (!c->a) { 1315 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1316 c->a = ca; 1317 c->free_a = PETSC_TRUE; 1318 } else { 1319 ca = c->a; 1320 PetscCall(PetscArrayzero(ca, ci[cm] + 1)); 1321 } 1322 1323 if (abt->usecoloring) { 1324 MatTransposeColoring matcoloring = abt->matcoloring; 1325 Mat Bt_dense, C_dense = abt->ABt_den; 1326 1327 /* Get Bt_dense by Apply MatTransposeColoring to B */ 1328 Bt_dense = abt->Bt_den; 1329 PetscCall(MatTransColoringApplySpToDen(matcoloring, B, Bt_dense)); 1330 1331 /* C_dense = A*Bt_dense */ 1332 PetscCall(MatMatMultNumeric_SeqAIJ_SeqDense(A, Bt_dense, C_dense)); 1333 1334 /* Recover C from C_dense */ 1335 PetscCall(MatTransColoringApplyDenToSp(matcoloring, C_dense, C)); 1336 PetscFunctionReturn(PETSC_SUCCESS); 1337 } 1338 1339 for (i = 0; i < cm; i++) { 1340 anzi = ai[i + 1] - ai[i]; 1341 acol = PetscSafePointerPlusOffset(aj, ai[i]); 1342 aval = PetscSafePointerPlusOffset(aa, ai[i]); 1343 cnzi = ci[i + 1] - ci[i]; 1344 ccol = PetscSafePointerPlusOffset(cj, ci[i]); 1345 cval = ca + ci[i]; 1346 for (j = 0; j < cnzi; j++) { 1347 brow = ccol[j]; 1348 bnzj = bi[brow + 1] - bi[brow]; 1349 bcol = bj + bi[brow]; 1350 bval = ba + bi[brow]; 1351 1352 /* perform sparse inner-product c(i,j)=A[i,:]*B[j,:]^T */ 1353 nexta = 0; 1354 nextb = 0; 1355 while (nexta < anzi && nextb < bnzj) { 1356 while (nexta < anzi && acol[nexta] < bcol[nextb]) nexta++; 1357 if (nexta == anzi) break; 1358 while (nextb < bnzj && acol[nexta] > bcol[nextb]) nextb++; 1359 if (nextb == bnzj) break; 1360 if (acol[nexta] == bcol[nextb]) { 1361 cval[j] += aval[nexta] * bval[nextb]; 1362 nexta++; 1363 nextb++; 1364 flops += 2; 1365 } 1366 } 1367 } 1368 } 1369 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1370 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1371 PetscCall(PetscLogFlops(flops)); 1372 PetscFunctionReturn(PETSC_SUCCESS); 1373 } 1374 1375 PetscErrorCode MatProductCtxDestroy_SeqAIJ_MatTransMatMult(void **data) 1376 { 1377 MatProductCtx_MatTransMatMult *atb = *(MatProductCtx_MatTransMatMult **)data; 1378 1379 PetscFunctionBegin; 1380 PetscCall(MatDestroy(&atb->At)); 1381 if (atb->destroy) PetscCall((*atb->destroy)(&atb->data)); 1382 PetscCall(PetscFree(atb)); 1383 PetscFunctionReturn(PETSC_SUCCESS); 1384 } 1385 1386 PetscErrorCode MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(Mat A, Mat B, PetscReal fill, Mat C) 1387 { 1388 Mat At = NULL; 1389 Mat_Product *product = C->product; 1390 PetscBool flg, def, square; 1391 1392 PetscFunctionBegin; 1393 MatCheckProduct(C, 4); 1394 square = (PetscBool)(A == B && A->symmetric == PETSC_BOOL3_TRUE); 1395 /* outerproduct */ 1396 PetscCall(PetscStrcmp(product->alg, "outerproduct", &flg)); 1397 if (flg) { 1398 /* create symbolic At */ 1399 if (!square) { 1400 PetscCall(MatTransposeSymbolic(A, &At)); 1401 PetscCall(MatSetBlockSizes(At, A->cmap->bs, B->cmap->bs)); 1402 PetscCall(MatSetType(At, ((PetscObject)A)->type_name)); 1403 } 1404 /* get symbolic C=At*B */ 1405 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1406 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1407 1408 /* clean up */ 1409 if (!square) PetscCall(MatDestroy(&At)); 1410 1411 C->ops->mattransposemultnumeric = MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ; /* outerproduct */ 1412 PetscCall(MatProductSetAlgorithm(C, "outerproduct")); 1413 PetscFunctionReturn(PETSC_SUCCESS); 1414 } 1415 1416 /* matmatmult */ 1417 PetscCall(PetscStrcmp(product->alg, "default", &def)); 1418 PetscCall(PetscStrcmp(product->alg, "at*b", &flg)); 1419 if (flg || def) { 1420 MatProductCtx_MatTransMatMult *atb; 1421 1422 PetscCheck(!product->data, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Extra product struct not empty"); 1423 PetscCall(PetscNew(&atb)); 1424 if (!square) PetscCall(MatTranspose(A, MAT_INITIAL_MATRIX, &At)); 1425 PetscCall(MatProductSetAlgorithm(C, "sorted")); 1426 PetscCall(MatMatMultSymbolic_SeqAIJ_SeqAIJ(square ? A : At, B, fill, C)); 1427 PetscCall(MatProductSetAlgorithm(C, "at*b")); 1428 product->data = atb; 1429 product->destroy = MatProductCtxDestroy_SeqAIJ_MatTransMatMult; 1430 atb->At = At; 1431 1432 C->ops->mattransposemultnumeric = NULL; /* see MatProductNumeric_AtB_SeqAIJ_SeqAIJ */ 1433 PetscFunctionReturn(PETSC_SUCCESS); 1434 } 1435 1436 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Mat Product Algorithm is not supported"); 1437 } 1438 1439 PetscErrorCode MatTransposeMatMultNumeric_SeqAIJ_SeqAIJ(Mat A, Mat B, Mat C) 1440 { 1441 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c = (Mat_SeqAIJ *)C->data; 1442 PetscInt am = A->rmap->n, anzi, *ai = a->i, *aj = a->j, *bi = b->i, *bj, bnzi, nextb; 1443 PetscInt cm = C->rmap->n, *ci = c->i, *cj = c->j, crow, *cjj, i, j, k; 1444 PetscLogDouble flops = 0.0; 1445 MatScalar *aa = a->a, *ba, *ca, *caj; 1446 1447 PetscFunctionBegin; 1448 if (!c->a) { 1449 PetscCall(PetscCalloc1(ci[cm] + 1, &ca)); 1450 1451 c->a = ca; 1452 c->free_a = PETSC_TRUE; 1453 } else { 1454 ca = c->a; 1455 PetscCall(PetscArrayzero(ca, ci[cm])); 1456 } 1457 1458 /* compute A^T*B using outer product (A^T)[:,i]*B[i,:] */ 1459 for (i = 0; i < am; i++) { 1460 bj = b->j + bi[i]; 1461 ba = b->a + bi[i]; 1462 bnzi = bi[i + 1] - bi[i]; 1463 anzi = ai[i + 1] - ai[i]; 1464 for (j = 0; j < anzi; j++) { 1465 nextb = 0; 1466 crow = *aj++; 1467 cjj = cj + ci[crow]; 1468 caj = ca + ci[crow]; 1469 /* perform sparse axpy operation. Note cjj includes bj. */ 1470 for (k = 0; nextb < bnzi; k++) { 1471 if (cjj[k] == *(bj + nextb)) { /* ccol == bcol */ 1472 caj[k] += (*aa) * (*(ba + nextb)); 1473 nextb++; 1474 } 1475 } 1476 flops += 2 * bnzi; 1477 aa++; 1478 } 1479 } 1480 1481 /* Assemble the final matrix and clean up */ 1482 PetscCall(MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY)); 1483 PetscCall(MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY)); 1484 PetscCall(PetscLogFlops(flops)); 1485 PetscFunctionReturn(PETSC_SUCCESS); 1486 } 1487 1488 PetscErrorCode MatMatMultSymbolic_SeqAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C) 1489 { 1490 PetscFunctionBegin; 1491 PetscCall(MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C)); 1492 C->ops->matmultnumeric = MatMatMultNumeric_SeqAIJ_SeqDense; 1493 PetscFunctionReturn(PETSC_SUCCESS); 1494 } 1495 1496 PETSC_INTERN PetscErrorCode MatMatMultNumericAdd_SeqAIJ_SeqDense(Mat A, Mat B, Mat C, const PetscBool add) 1497 { 1498 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1499 PetscScalar *c, r1, r2, r3, r4, *c1, *c2, *c3, *c4; 1500 const PetscScalar *aa, *b, *b1, *b2, *b3, *b4, *av; 1501 const PetscInt *aj; 1502 PetscInt cm = C->rmap->n, cn = B->cmap->n, bm, am = A->rmap->n; 1503 PetscInt clda; 1504 PetscInt am4, bm4, col, i, j, n; 1505 1506 PetscFunctionBegin; 1507 if (!cm || !cn) PetscFunctionReturn(PETSC_SUCCESS); 1508 PetscCall(MatSeqAIJGetArrayRead(A, &av)); 1509 if (add) { 1510 PetscCall(MatDenseGetArray(C, &c)); 1511 } else { 1512 PetscCall(MatDenseGetArrayWrite(C, &c)); 1513 } 1514 PetscCall(MatDenseGetArrayRead(B, &b)); 1515 PetscCall(MatDenseGetLDA(B, &bm)); 1516 PetscCall(MatDenseGetLDA(C, &clda)); 1517 am4 = 4 * clda; 1518 bm4 = 4 * bm; 1519 if (b) { 1520 b1 = b; 1521 b2 = b1 + bm; 1522 b3 = b2 + bm; 1523 b4 = b3 + bm; 1524 } else b1 = b2 = b3 = b4 = NULL; 1525 c1 = c; 1526 c2 = c1 + clda; 1527 c3 = c2 + clda; 1528 c4 = c3 + clda; 1529 for (col = 0; col < (cn / 4) * 4; col += 4) { /* over columns of C */ 1530 for (i = 0; i < am; i++) { /* over rows of A in those columns */ 1531 r1 = r2 = r3 = r4 = 0.0; 1532 n = a->i[i + 1] - a->i[i]; 1533 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1534 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1535 for (j = 0; j < n; j++) { 1536 const PetscScalar aatmp = aa[j]; 1537 const PetscInt ajtmp = aj[j]; 1538 r1 += aatmp * b1[ajtmp]; 1539 r2 += aatmp * b2[ajtmp]; 1540 r3 += aatmp * b3[ajtmp]; 1541 r4 += aatmp * b4[ajtmp]; 1542 } 1543 if (add) { 1544 c1[i] += r1; 1545 c2[i] += r2; 1546 c3[i] += r3; 1547 c4[i] += r4; 1548 } else { 1549 c1[i] = r1; 1550 c2[i] = r2; 1551 c3[i] = r3; 1552 c4[i] = r4; 1553 } 1554 } 1555 if (b) { 1556 b1 += bm4; 1557 b2 += bm4; 1558 b3 += bm4; 1559 b4 += bm4; 1560 } 1561 c1 += am4; 1562 c2 += am4; 1563 c3 += am4; 1564 c4 += am4; 1565 } 1566 /* process remaining columns */ 1567 if (col != cn) { 1568 PetscInt rc = cn - col; 1569 1570 if (rc == 1) { 1571 for (i = 0; i < am; i++) { 1572 r1 = 0.0; 1573 n = a->i[i + 1] - a->i[i]; 1574 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1575 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1576 for (j = 0; j < n; j++) r1 += aa[j] * b1[aj[j]]; 1577 if (add) c1[i] += r1; 1578 else c1[i] = r1; 1579 } 1580 } else if (rc == 2) { 1581 for (i = 0; i < am; i++) { 1582 r1 = r2 = 0.0; 1583 n = a->i[i + 1] - a->i[i]; 1584 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1585 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1586 for (j = 0; j < n; j++) { 1587 const PetscScalar aatmp = aa[j]; 1588 const PetscInt ajtmp = aj[j]; 1589 r1 += aatmp * b1[ajtmp]; 1590 r2 += aatmp * b2[ajtmp]; 1591 } 1592 if (add) { 1593 c1[i] += r1; 1594 c2[i] += r2; 1595 } else { 1596 c1[i] = r1; 1597 c2[i] = r2; 1598 } 1599 } 1600 } else { 1601 for (i = 0; i < am; i++) { 1602 r1 = r2 = r3 = 0.0; 1603 n = a->i[i + 1] - a->i[i]; 1604 aj = PetscSafePointerPlusOffset(a->j, a->i[i]); 1605 aa = PetscSafePointerPlusOffset(av, a->i[i]); 1606 for (j = 0; j < n; j++) { 1607 const PetscScalar aatmp = aa[j]; 1608 const PetscInt ajtmp = aj[j]; 1609 r1 += aatmp * b1[ajtmp]; 1610 r2 += aatmp * b2[ajtmp]; 1611 r3 += aatmp * b3[ajtmp]; 1612 } 1613 if (add) { 1614 c1[i] += r1; 1615 c2[i] += r2; 1616 c3[i] += r3; 1617 } else { 1618 c1[i] = r1; 1619 c2[i] = r2; 1620 c3[i] = r3; 1621 } 1622 } 1623 } 1624 } 1625 PetscCall(PetscLogFlops(cn * (2.0 * a->nz))); 1626 if (add) { 1627 PetscCall(MatDenseRestoreArray(C, &c)); 1628 } else { 1629 PetscCall(MatDenseRestoreArrayWrite(C, &c)); 1630 } 1631 PetscCall(MatDenseRestoreArrayRead(B, &b)); 1632 PetscCall(MatSeqAIJRestoreArrayRead(A, &av)); 1633 PetscFunctionReturn(PETSC_SUCCESS); 1634 } 1635 1636 PetscErrorCode MatMatMultNumeric_SeqAIJ_SeqDense(Mat A, Mat B, Mat C) 1637 { 1638 PetscFunctionBegin; 1639 PetscCheck(B->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in A %" PetscInt_FMT " not equal rows in B %" PetscInt_FMT, A->cmap->n, B->rmap->n); 1640 PetscCheck(A->rmap->n == C->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number rows in C %" PetscInt_FMT " not equal rows in A %" PetscInt_FMT, C->rmap->n, A->rmap->n); 1641 PetscCheck(B->cmap->n == C->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Number columns in B %" PetscInt_FMT " not equal columns in C %" PetscInt_FMT, B->cmap->n, C->cmap->n); 1642 1643 PetscCall(MatMatMultNumericAdd_SeqAIJ_SeqDense(A, B, C, PETSC_FALSE)); 1644 PetscFunctionReturn(PETSC_SUCCESS); 1645 } 1646 1647 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AB(Mat C) 1648 { 1649 PetscFunctionBegin; 1650 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqDense; 1651 C->ops->productsymbolic = MatProductSymbolic_AB; 1652 PetscFunctionReturn(PETSC_SUCCESS); 1653 } 1654 1655 PETSC_INTERN PetscErrorCode MatTMatTMultSymbolic_SeqAIJ_SeqDense(Mat, Mat, PetscReal, Mat); 1656 1657 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(Mat C) 1658 { 1659 PetscFunctionBegin; 1660 C->ops->transposematmultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1661 C->ops->productsymbolic = MatProductSymbolic_AtB; 1662 PetscFunctionReturn(PETSC_SUCCESS); 1663 } 1664 1665 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(Mat C) 1666 { 1667 PetscFunctionBegin; 1668 C->ops->mattransposemultsymbolic = MatTMatTMultSymbolic_SeqAIJ_SeqDense; 1669 C->ops->productsymbolic = MatProductSymbolic_ABt; 1670 PetscFunctionReturn(PETSC_SUCCESS); 1671 } 1672 1673 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat C) 1674 { 1675 Mat_Product *product = C->product; 1676 1677 PetscFunctionBegin; 1678 switch (product->type) { 1679 case MATPRODUCT_AB: 1680 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AB(C)); 1681 break; 1682 case MATPRODUCT_AtB: 1683 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_AtB(C)); 1684 break; 1685 case MATPRODUCT_ABt: 1686 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense_ABt(C)); 1687 break; 1688 default: 1689 break; 1690 } 1691 PetscFunctionReturn(PETSC_SUCCESS); 1692 } 1693 1694 static PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(Mat C) 1695 { 1696 Mat_Product *product = C->product; 1697 Mat A = product->A; 1698 PetscBool baij; 1699 1700 PetscFunctionBegin; 1701 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQBAIJ, &baij)); 1702 if (!baij) { /* A is seqsbaij */ 1703 PetscBool sbaij; 1704 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQSBAIJ, &sbaij)); 1705 PetscCheck(sbaij, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONGSTATE, "Mat must be either seqbaij or seqsbaij format"); 1706 1707 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqSBAIJ_SeqDense; 1708 } else { /* A is seqbaij */ 1709 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqBAIJ_SeqDense; 1710 } 1711 1712 C->ops->productsymbolic = MatProductSymbolic_AB; 1713 PetscFunctionReturn(PETSC_SUCCESS); 1714 } 1715 1716 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqXBAIJ_SeqDense(Mat C) 1717 { 1718 Mat_Product *product = C->product; 1719 1720 PetscFunctionBegin; 1721 MatCheckProduct(C, 1); 1722 PetscCheck(product->A, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing A"); 1723 if (product->type == MATPRODUCT_AB || (product->type == MATPRODUCT_AtB && product->A->symmetric == PETSC_BOOL3_TRUE)) PetscCall(MatProductSetFromOptions_SeqXBAIJ_SeqDense_AB(C)); 1724 else if (product->type == MATPRODUCT_AtB) { 1725 PetscBool flg; 1726 1727 PetscCall(PetscObjectTypeCompare((PetscObject)product->A, MATSEQBAIJ, &flg)); 1728 if (flg) { 1729 C->ops->transposematmultsymbolic = MatTransposeMatMultSymbolic_SeqBAIJ_SeqDense; 1730 C->ops->productsymbolic = MatProductSymbolic_AtB; 1731 } 1732 } 1733 PetscFunctionReturn(PETSC_SUCCESS); 1734 } 1735 1736 static PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ_AB(Mat C) 1737 { 1738 PetscFunctionBegin; 1739 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqDense_SeqAIJ; 1740 C->ops->productsymbolic = MatProductSymbolic_AB; 1741 PetscFunctionReturn(PETSC_SUCCESS); 1742 } 1743 1744 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqDense_SeqAIJ(Mat C) 1745 { 1746 Mat_Product *product = C->product; 1747 1748 PetscFunctionBegin; 1749 if (product->type == MATPRODUCT_AB) PetscCall(MatProductSetFromOptions_SeqDense_SeqAIJ_AB(C)); 1750 PetscFunctionReturn(PETSC_SUCCESS); 1751 } 1752 1753 PetscErrorCode MatTransColoringApplySpToDen_SeqAIJ(MatTransposeColoring coloring, Mat B, Mat Btdense) 1754 { 1755 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1756 Mat_SeqDense *btdense = (Mat_SeqDense *)Btdense->data; 1757 PetscInt *bi = b->i, *bj = b->j; 1758 PetscInt m = Btdense->rmap->n, n = Btdense->cmap->n, j, k, l, col, anz, *btcol, brow, ncolumns; 1759 MatScalar *btval, *btval_den, *ba = b->a; 1760 PetscInt *columns = coloring->columns, *colorforcol = coloring->colorforcol, ncolors = coloring->ncolors; 1761 1762 PetscFunctionBegin; 1763 btval_den = btdense->v; 1764 PetscCall(PetscArrayzero(btval_den, m * n)); 1765 for (k = 0; k < ncolors; k++) { 1766 ncolumns = coloring->ncolumns[k]; 1767 for (l = 0; l < ncolumns; l++) { /* insert a row of B to a column of Btdense */ 1768 col = *(columns + colorforcol[k] + l); 1769 btcol = bj + bi[col]; 1770 btval = ba + bi[col]; 1771 anz = bi[col + 1] - bi[col]; 1772 for (j = 0; j < anz; j++) { 1773 brow = btcol[j]; 1774 btval_den[brow] = btval[j]; 1775 } 1776 } 1777 btval_den += m; 1778 } 1779 PetscFunctionReturn(PETSC_SUCCESS); 1780 } 1781 1782 PetscErrorCode MatTransColoringApplyDenToSp_SeqAIJ(MatTransposeColoring matcoloring, Mat Cden, Mat Csp) 1783 { 1784 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)Csp->data; 1785 const PetscScalar *ca_den, *ca_den_ptr; 1786 PetscScalar *ca = csp->a; 1787 PetscInt k, l, m = Cden->rmap->n, ncolors = matcoloring->ncolors; 1788 PetscInt brows = matcoloring->brows, *den2sp = matcoloring->den2sp; 1789 PetscInt nrows, *row, *idx; 1790 PetscInt *rows = matcoloring->rows, *colorforrow = matcoloring->colorforrow; 1791 1792 PetscFunctionBegin; 1793 PetscCall(MatDenseGetArrayRead(Cden, &ca_den)); 1794 1795 if (brows > 0) { 1796 PetscInt *lstart, row_end, row_start; 1797 lstart = matcoloring->lstart; 1798 PetscCall(PetscArrayzero(lstart, ncolors)); 1799 1800 row_end = brows; 1801 if (row_end > m) row_end = m; 1802 for (row_start = 0; row_start < m; row_start += brows) { /* loop over row blocks of Csp */ 1803 ca_den_ptr = ca_den; 1804 for (k = 0; k < ncolors; k++) { /* loop over colors (columns of Cden) */ 1805 nrows = matcoloring->nrows[k]; 1806 row = rows + colorforrow[k]; 1807 idx = den2sp + colorforrow[k]; 1808 for (l = lstart[k]; l < nrows; l++) { 1809 if (row[l] >= row_end) { 1810 lstart[k] = l; 1811 break; 1812 } else { 1813 ca[idx[l]] = ca_den_ptr[row[l]]; 1814 } 1815 } 1816 ca_den_ptr += m; 1817 } 1818 row_end += brows; 1819 if (row_end > m) row_end = m; 1820 } 1821 } else { /* non-blocked impl: loop over columns of Csp - slow if Csp is large */ 1822 ca_den_ptr = ca_den; 1823 for (k = 0; k < ncolors; k++) { 1824 nrows = matcoloring->nrows[k]; 1825 row = rows + colorforrow[k]; 1826 idx = den2sp + colorforrow[k]; 1827 for (l = 0; l < nrows; l++) ca[idx[l]] = ca_den_ptr[row[l]]; 1828 ca_den_ptr += m; 1829 } 1830 } 1831 1832 PetscCall(MatDenseRestoreArrayRead(Cden, &ca_den)); 1833 #if defined(PETSC_USE_INFO) 1834 if (matcoloring->brows > 0) { 1835 PetscCall(PetscInfo(Csp, "Loop over %" PetscInt_FMT " row blocks for den2sp\n", brows)); 1836 } else { 1837 PetscCall(PetscInfo(Csp, "Loop over colors/columns of Cden, inefficient for large sparse matrix product \n")); 1838 } 1839 #endif 1840 PetscFunctionReturn(PETSC_SUCCESS); 1841 } 1842 1843 PetscErrorCode MatTransposeColoringCreate_SeqAIJ(Mat mat, ISColoring iscoloring, MatTransposeColoring c) 1844 { 1845 PetscInt i, n, nrows, Nbs, j, k, m, ncols, col, cm; 1846 const PetscInt *is, *ci, *cj, *row_idx; 1847 PetscInt nis = iscoloring->n, *rowhit, bs = 1; 1848 IS *isa; 1849 Mat_SeqAIJ *csp = (Mat_SeqAIJ *)mat->data; 1850 PetscInt *colorforrow, *rows, *rows_i, *idxhit, *spidx, *den2sp, *den2sp_i; 1851 PetscInt *colorforcol, *columns, *columns_i, brows; 1852 PetscBool flg; 1853 1854 PetscFunctionBegin; 1855 PetscCall(ISColoringGetIS(iscoloring, PETSC_USE_POINTER, PETSC_IGNORE, &isa)); 1856 1857 /* bs > 1 is not being tested yet! */ 1858 Nbs = mat->cmap->N / bs; 1859 c->M = mat->rmap->N / bs; /* set total rows, columns and local rows */ 1860 c->N = Nbs; 1861 c->m = c->M; 1862 c->rstart = 0; 1863 c->brows = 100; 1864 1865 c->ncolors = nis; 1866 PetscCall(PetscMalloc3(nis, &c->ncolumns, nis, &c->nrows, nis + 1, &colorforrow)); 1867 PetscCall(PetscMalloc1(csp->nz + 1, &rows)); 1868 PetscCall(PetscMalloc1(csp->nz + 1, &den2sp)); 1869 1870 brows = c->brows; 1871 PetscCall(PetscOptionsGetInt(NULL, NULL, "-matden2sp_brows", &brows, &flg)); 1872 if (flg) c->brows = brows; 1873 if (brows > 0) PetscCall(PetscMalloc1(nis + 1, &c->lstart)); 1874 1875 colorforrow[0] = 0; 1876 rows_i = rows; 1877 den2sp_i = den2sp; 1878 1879 PetscCall(PetscMalloc1(nis + 1, &colorforcol)); 1880 PetscCall(PetscMalloc1(Nbs + 1, &columns)); 1881 1882 colorforcol[0] = 0; 1883 columns_i = columns; 1884 1885 /* get column-wise storage of mat */ 1886 PetscCall(MatGetColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1887 1888 cm = c->m; 1889 PetscCall(PetscMalloc1(cm + 1, &rowhit)); 1890 PetscCall(PetscMalloc1(cm + 1, &idxhit)); 1891 for (i = 0; i < nis; i++) { /* loop over color */ 1892 PetscCall(ISGetLocalSize(isa[i], &n)); 1893 PetscCall(ISGetIndices(isa[i], &is)); 1894 1895 c->ncolumns[i] = n; 1896 if (n) PetscCall(PetscArraycpy(columns_i, is, n)); 1897 colorforcol[i + 1] = colorforcol[i] + n; 1898 columns_i += n; 1899 1900 /* fast, crude version requires O(N*N) work */ 1901 PetscCall(PetscArrayzero(rowhit, cm)); 1902 1903 for (j = 0; j < n; j++) { /* loop over columns*/ 1904 col = is[j]; 1905 row_idx = cj + ci[col]; 1906 m = ci[col + 1] - ci[col]; 1907 for (k = 0; k < m; k++) { /* loop over columns marking them in rowhit */ 1908 idxhit[*row_idx] = spidx[ci[col] + k]; 1909 rowhit[*row_idx++] = col + 1; 1910 } 1911 } 1912 /* count the number of hits */ 1913 nrows = 0; 1914 for (j = 0; j < cm; j++) { 1915 if (rowhit[j]) nrows++; 1916 } 1917 c->nrows[i] = nrows; 1918 colorforrow[i + 1] = colorforrow[i] + nrows; 1919 1920 nrows = 0; 1921 for (j = 0; j < cm; j++) { /* loop over rows */ 1922 if (rowhit[j]) { 1923 rows_i[nrows] = j; 1924 den2sp_i[nrows] = idxhit[j]; 1925 nrows++; 1926 } 1927 } 1928 den2sp_i += nrows; 1929 1930 PetscCall(ISRestoreIndices(isa[i], &is)); 1931 rows_i += nrows; 1932 } 1933 PetscCall(MatRestoreColumnIJ_SeqAIJ_Color(mat, 0, PETSC_FALSE, PETSC_FALSE, &ncols, &ci, &cj, &spidx, NULL)); 1934 PetscCall(PetscFree(rowhit)); 1935 PetscCall(ISColoringRestoreIS(iscoloring, PETSC_USE_POINTER, &isa)); 1936 PetscCheck(csp->nz == colorforrow[nis], PETSC_COMM_SELF, PETSC_ERR_PLIB, "csp->nz %" PetscInt_FMT " != colorforrow[nis] %" PetscInt_FMT, csp->nz, colorforrow[nis]); 1937 1938 c->colorforrow = colorforrow; 1939 c->rows = rows; 1940 c->den2sp = den2sp; 1941 c->colorforcol = colorforcol; 1942 c->columns = columns; 1943 1944 PetscCall(PetscFree(idxhit)); 1945 PetscFunctionReturn(PETSC_SUCCESS); 1946 } 1947 1948 static PetscErrorCode MatProductNumeric_AtB_SeqAIJ_SeqAIJ(Mat C) 1949 { 1950 Mat_Product *product = C->product; 1951 Mat A = product->A, B = product->B; 1952 1953 PetscFunctionBegin; 1954 if (C->ops->mattransposemultnumeric) { 1955 /* Alg: "outerproduct" */ 1956 PetscCall((*C->ops->mattransposemultnumeric)(A, B, C)); 1957 } else { 1958 /* Alg: "matmatmult" -- C = At*B */ 1959 MatProductCtx_MatTransMatMult *atb = (MatProductCtx_MatTransMatMult *)product->data; 1960 1961 PetscCheck(atb, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing product struct"); 1962 if (atb->At) { 1963 /* At is computed in MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(); 1964 user may have called MatProductReplaceMats() to get this A=product->A */ 1965 PetscCall(MatTransposeSetPrecursor(A, atb->At)); 1966 PetscCall(MatTranspose(A, MAT_REUSE_MATRIX, &atb->At)); 1967 } 1968 PetscCall(MatMatMultNumeric_SeqAIJ_SeqAIJ(atb->At ? atb->At : A, B, C)); 1969 } 1970 PetscFunctionReturn(PETSC_SUCCESS); 1971 } 1972 1973 static PetscErrorCode MatProductSymbolic_AtB_SeqAIJ_SeqAIJ(Mat C) 1974 { 1975 Mat_Product *product = C->product; 1976 Mat A = product->A, B = product->B; 1977 PetscReal fill = product->fill; 1978 1979 PetscFunctionBegin; 1980 PetscCall(MatTransposeMatMultSymbolic_SeqAIJ_SeqAIJ(A, B, fill, C)); 1981 1982 C->ops->productnumeric = MatProductNumeric_AtB_SeqAIJ_SeqAIJ; 1983 PetscFunctionReturn(PETSC_SUCCESS); 1984 } 1985 1986 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AB(Mat C) 1987 { 1988 Mat_Product *product = C->product; 1989 PetscInt alg = 0; /* default algorithm */ 1990 PetscBool flg = PETSC_FALSE; 1991 #if !defined(PETSC_HAVE_HYPRE) 1992 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 1993 PetscInt nalg = 7; 1994 #else 1995 const char *algTypes[8] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge", "hypre"}; 1996 PetscInt nalg = 8; 1997 #endif 1998 1999 PetscFunctionBegin; 2000 /* Set default algorithm */ 2001 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2002 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2003 2004 /* Get runtime option */ 2005 if (product->api_user) { 2006 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMult", "Mat"); 2007 PetscCall(PetscOptionsEList("-matmatmult_via", "Algorithmic approach", "MatMatMult", algTypes, nalg, algTypes[0], &alg, &flg)); 2008 PetscOptionsEnd(); 2009 } else { 2010 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AB", "Mat"); 2011 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AB", algTypes, nalg, algTypes[0], &alg, &flg)); 2012 PetscOptionsEnd(); 2013 } 2014 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2015 2016 C->ops->productsymbolic = MatProductSymbolic_AB; 2017 C->ops->matmultsymbolic = MatMatMultSymbolic_SeqAIJ_SeqAIJ; 2018 PetscFunctionReturn(PETSC_SUCCESS); 2019 } 2020 2021 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_AtB(Mat C) 2022 { 2023 Mat_Product *product = C->product; 2024 PetscInt alg = 0; /* default algorithm */ 2025 PetscBool flg = PETSC_FALSE; 2026 const char *algTypes[3] = {"default", "at*b", "outerproduct"}; 2027 PetscInt nalg = 3; 2028 2029 PetscFunctionBegin; 2030 /* Get runtime option */ 2031 if (product->api_user) { 2032 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatTransposeMatMult", "Mat"); 2033 PetscCall(PetscOptionsEList("-mattransposematmult_via", "Algorithmic approach", "MatTransposeMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2034 PetscOptionsEnd(); 2035 } else { 2036 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_AtB", "Mat"); 2037 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_AtB", algTypes, nalg, algTypes[alg], &alg, &flg)); 2038 PetscOptionsEnd(); 2039 } 2040 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2041 2042 C->ops->productsymbolic = MatProductSymbolic_AtB_SeqAIJ_SeqAIJ; 2043 PetscFunctionReturn(PETSC_SUCCESS); 2044 } 2045 2046 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABt(Mat C) 2047 { 2048 Mat_Product *product = C->product; 2049 PetscInt alg = 0; /* default algorithm */ 2050 PetscBool flg = PETSC_FALSE; 2051 const char *algTypes[2] = {"default", "color"}; 2052 PetscInt nalg = 2; 2053 2054 PetscFunctionBegin; 2055 /* Set default algorithm */ 2056 PetscCall(PetscStrcmp(C->product->alg, "default", &flg)); 2057 if (!flg) { 2058 alg = 1; 2059 PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2060 } 2061 2062 /* Get runtime option */ 2063 if (product->api_user) { 2064 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatTransposeMult", "Mat"); 2065 PetscCall(PetscOptionsEList("-matmattransmult_via", "Algorithmic approach", "MatMatTransposeMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2066 PetscOptionsEnd(); 2067 } else { 2068 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABt", "Mat"); 2069 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABt", algTypes, nalg, algTypes[alg], &alg, &flg)); 2070 PetscOptionsEnd(); 2071 } 2072 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2073 2074 C->ops->mattransposemultsymbolic = MatMatTransposeMultSymbolic_SeqAIJ_SeqAIJ; 2075 C->ops->productsymbolic = MatProductSymbolic_ABt; 2076 PetscFunctionReturn(PETSC_SUCCESS); 2077 } 2078 2079 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_PtAP(Mat C) 2080 { 2081 Mat_Product *product = C->product; 2082 PetscBool flg = PETSC_FALSE; 2083 PetscInt alg = 0; /* default algorithm -- alg=1 should be default!!! */ 2084 #if !defined(PETSC_HAVE_HYPRE) 2085 const char *algTypes[2] = {"scalable", "rap"}; 2086 PetscInt nalg = 2; 2087 #else 2088 const char *algTypes[3] = {"scalable", "rap", "hypre"}; 2089 PetscInt nalg = 3; 2090 #endif 2091 2092 PetscFunctionBegin; 2093 /* Set default algorithm */ 2094 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2095 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2096 2097 /* Get runtime option */ 2098 if (product->api_user) { 2099 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatPtAP", "Mat"); 2100 PetscCall(PetscOptionsEList("-matptap_via", "Algorithmic approach", "MatPtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2101 PetscOptionsEnd(); 2102 } else { 2103 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_PtAP", "Mat"); 2104 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_PtAP", algTypes, nalg, algTypes[0], &alg, &flg)); 2105 PetscOptionsEnd(); 2106 } 2107 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2108 2109 C->ops->productsymbolic = MatProductSymbolic_PtAP_SeqAIJ_SeqAIJ; 2110 PetscFunctionReturn(PETSC_SUCCESS); 2111 } 2112 2113 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_RARt(Mat C) 2114 { 2115 Mat_Product *product = C->product; 2116 PetscBool flg = PETSC_FALSE; 2117 PetscInt alg = 0; /* default algorithm */ 2118 const char *algTypes[3] = {"r*a*rt", "r*art", "coloring_rart"}; 2119 PetscInt nalg = 3; 2120 2121 PetscFunctionBegin; 2122 /* Set default algorithm */ 2123 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2124 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2125 2126 /* Get runtime option */ 2127 if (product->api_user) { 2128 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatRARt", "Mat"); 2129 PetscCall(PetscOptionsEList("-matrart_via", "Algorithmic approach", "MatRARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2130 PetscOptionsEnd(); 2131 } else { 2132 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_RARt", "Mat"); 2133 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_RARt", algTypes, nalg, algTypes[0], &alg, &flg)); 2134 PetscOptionsEnd(); 2135 } 2136 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2137 2138 C->ops->productsymbolic = MatProductSymbolic_RARt_SeqAIJ_SeqAIJ; 2139 PetscFunctionReturn(PETSC_SUCCESS); 2140 } 2141 2142 /* ABC = A*B*C = A*(B*C); ABC's algorithm must be chosen from AB's algorithm */ 2143 static PetscErrorCode MatProductSetFromOptions_SeqAIJ_ABC(Mat C) 2144 { 2145 Mat_Product *product = C->product; 2146 PetscInt alg = 0; /* default algorithm */ 2147 PetscBool flg = PETSC_FALSE; 2148 const char *algTypes[7] = {"sorted", "scalable", "scalable_fast", "heap", "btheap", "llcondensed", "rowmerge"}; 2149 PetscInt nalg = 7; 2150 2151 PetscFunctionBegin; 2152 /* Set default algorithm */ 2153 PetscCall(PetscStrcmp(product->alg, "default", &flg)); 2154 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2155 2156 /* Get runtime option */ 2157 if (product->api_user) { 2158 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatMatMatMult", "Mat"); 2159 PetscCall(PetscOptionsEList("-matmatmatmult_via", "Algorithmic approach", "MatMatMatMult", algTypes, nalg, algTypes[alg], &alg, &flg)); 2160 PetscOptionsEnd(); 2161 } else { 2162 PetscOptionsBegin(PetscObjectComm((PetscObject)C), ((PetscObject)C)->prefix, "MatProduct_ABC", "Mat"); 2163 PetscCall(PetscOptionsEList("-mat_product_algorithm", "Algorithmic approach", "MatProduct_ABC", algTypes, nalg, algTypes[alg], &alg, &flg)); 2164 PetscOptionsEnd(); 2165 } 2166 if (flg) PetscCall(MatProductSetAlgorithm(C, algTypes[alg])); 2167 2168 C->ops->matmatmultsymbolic = MatMatMatMultSymbolic_SeqAIJ_SeqAIJ_SeqAIJ; 2169 C->ops->productsymbolic = MatProductSymbolic_ABC; 2170 PetscFunctionReturn(PETSC_SUCCESS); 2171 } 2172 2173 PetscErrorCode MatProductSetFromOptions_SeqAIJ(Mat C) 2174 { 2175 Mat_Product *product = C->product; 2176 2177 PetscFunctionBegin; 2178 switch (product->type) { 2179 case MATPRODUCT_AB: 2180 PetscCall(MatProductSetFromOptions_SeqAIJ_AB(C)); 2181 break; 2182 case MATPRODUCT_AtB: 2183 PetscCall(MatProductSetFromOptions_SeqAIJ_AtB(C)); 2184 break; 2185 case MATPRODUCT_ABt: 2186 PetscCall(MatProductSetFromOptions_SeqAIJ_ABt(C)); 2187 break; 2188 case MATPRODUCT_PtAP: 2189 PetscCall(MatProductSetFromOptions_SeqAIJ_PtAP(C)); 2190 break; 2191 case MATPRODUCT_RARt: 2192 PetscCall(MatProductSetFromOptions_SeqAIJ_RARt(C)); 2193 break; 2194 case MATPRODUCT_ABC: 2195 PetscCall(MatProductSetFromOptions_SeqAIJ_ABC(C)); 2196 break; 2197 default: 2198 break; 2199 } 2200 PetscFunctionReturn(PETSC_SUCCESS); 2201 } 2202